diff --git a/CMakeLists.txt b/CMakeLists.txt index c0ec99c7..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") # Add also all the strategies -file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c") @@ -340,6 +340,9 @@ if(NOT DEFINED MSVC) if(NOT "test_external_symbols" IN_LIST XFAIL) add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() + if(NOT "test_mtt" IN_LIST XFAIL) + add_test( NAME test_mtt COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mtt.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() if(NOT "test_intra" IN_LIST XFAIL) add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() diff --git a/src/cabac.h b/src/cabac.h index be249ba2..f38030a9 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -77,6 +77,8 @@ typedef struct cabac_ctx_t mts_idx_model[4]; cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models + cabac_ctx_t mtt_vertical_model[5]; //!< \brief mtt split direction context models + cabac_ctx_t mtt_binary_model[4]; //!< \brief mtt binary/ternary split context models cabac_ctx_t intra_luma_mpm_flag_model; //!< \brief intra mode context models cabac_ctx_t intra_subpart_model[2]; //!< \brief intra sub part context models cabac_ctx_t chroma_pred_model; diff --git a/src/cfg.c b/src/cfg.c index cafadcb2..bf9e1307 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -80,7 +80,6 @@ int uvg_config_init(uvg_config *cfg) cfg->trskip_max_size = 2; //Default to 4x4 cfg->mts = 0; cfg->mts_implicit = 0; - cfg->tr_depth_intra = 0; cfg->ime_algorithm = 0; /* hexbs */ cfg->fme_level = 4; cfg->source_scan_type = 0; /* progressive */ @@ -207,6 +206,8 @@ int uvg_config_init(uvg_config *cfg) cfg->lfnst = false; + cfg->isp = false; + parse_qp_map(cfg, 0); cfg->jccr = 0; @@ -221,10 +222,27 @@ int uvg_config_init(uvg_config *cfg) cfg->cabac_debug_file_name = NULL; cfg->dual_tree = 0; + + cfg->min_qt_size[0] = 4; + cfg->min_qt_size[1] = 4; + cfg->min_qt_size[2] = 4; + + cfg->max_btt_depth[0] = 0; + cfg->max_btt_depth[1] = 0; + cfg->max_btt_depth[2] = 0; + + cfg->max_tt_size[0] = 64; + cfg->max_bt_size[0] = 64; + cfg->max_tt_size[1] = 64; + cfg->max_bt_size[1] = 64; + cfg->max_tt_size[2] = 64; + cfg->max_bt_size[2] = 64; + cfg->intra_rough_search_levels = 2; cfg->ibc = 0; + cfg->dep_quant = 0; return 1; } @@ -333,7 +351,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil return 1; } -/* + static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; @@ -349,7 +367,7 @@ static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) return 1; } } -*/ + static int parse_int8(const char *numstr,int8_t* number,int min, int max) { char *tail; @@ -365,7 +383,7 @@ static int parse_int8(const char *numstr,int8_t* number,int min, int max) return 1; } } -/* + static int parse_array(const char *array, uint8_t *coeff_key, int size, int min, int max) { @@ -389,15 +407,15 @@
static int parse_array(const char *array, uint8_t *coeff_key, int size, free(key); return 0; } - else if (imts = mts_type; cfg->mts_implicit = (mts_type == UVG_MTS_IMPLICIT); } - else if OPT("tr-depth-intra") - cfg->tr_depth_intra = atoi(value); else if OPT("me") { int8_t ime_algorithm = 0; if (!parse_enum(value, me_names, &ime_algorithm)) return 0; @@ -1454,6 +1470,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("lfnst") { cfg->lfnst = atobool(value); } + else if OPT("isp") { + cfg->isp = atobool(value); + } else if OPT("jccr") { cfg->jccr = (bool)atobool(value); } @@ -1479,6 +1498,49 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("dual-tree") { cfg->dual_tree = atobool(value); } + else if OPT("mtt-depth-intra") { + cfg->max_btt_depth[0] = atoi(value); + } + else if OPT("mtt-depth-intra-chroma") { + cfg->max_btt_depth[2] = atoi(value); + } + else if OPT("mtt-depth-inter") { + cfg->max_btt_depth[1] = atoi(value); + } + else if OPT("max-bt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_bt_size[0] = sizes[0]; + cfg->max_bt_size[1] = sizes[0]; + cfg->max_bt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_bt_size[0] = sizes[0]; + cfg->max_bt_size[1] = sizes[1]; + cfg->max_bt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-bt-size\n"); + return 0; + } + } + else if OPT("max-tt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[0]; + cfg->max_tt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[1]; + cfg->max_tt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-tt-size\n"); + return 0; + } + } else if OPT("intra-rough-granularity") { cfg->intra_rough_search_levels = atoi(value); } @@ -1489,7 +1551,11 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) return 0; } cfg->ibc = (uint8_t)ibc_value; - } else { + } + else if OPT("dep-quant") { + cfg->dep_quant = (bool)atobool(value); + } + else { return 0; } #undef OPT @@ -1681,12 +1747,6 @@ int uvg_config_validate(const uvg_config *const cfg) error = 1; } - if (cfg->tr_depth_intra < 0 || cfg->tr_depth_intra > 4) { - // range is 0 .. 
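A note on the convention behind the new three-element config arrays (min_qt_size, max_btt_depth, max_bt_size, max_tt_size): index 0 holds the limit for intra slices, index 1 for inter slices, and index 2 for the chroma tree of intra slices when dual tree is in use; the same indexing reappears in uvg_get_possible_splits in src/cu.c. A usage sketch for the one-or-three-value options parsed above (binary name and unrelated flags are illustrative only):

// Shared cap for every slice type:
//   uvg266 -i in.yuv --input-res 1920x1080 --max-bt-size 64 -o out.vvc
// Separate caps for intra / inter / intra-chroma:
//   uvg266 -i in.yuv --input-res 1920x1080 --max-bt-size 64,64,32 --mtt-depth-intra 2 -o out.vvc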
CtbLog2SizeY - Log2MinTrafoSize - fprintf(stderr, "Input error: --tr-depth-intra is out of range [0..4]\n"); - error = 1; - } - if (cfg->fme_level != 0 && cfg->fme_level > 4) { fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n"); error = 1; diff --git a/src/cli.c b/src/cli.c index fa6ee6df..6e66f77e 100644 --- a/src/cli.c +++ b/src/cli.c @@ -76,7 +76,6 @@ static const struct option long_options[] = { { "tr-skip-max-size", required_argument, NULL, 0 }, { "mts", required_argument, NULL, 0 }, { "no-mts", no_argument, NULL, 0 }, - { "tr-depth-intra", required_argument, NULL, 0 }, { "me", required_argument, NULL, 0 }, { "subme", required_argument, NULL, 0 }, { "source-scan-type", required_argument, NULL, 0 }, @@ -178,6 +177,8 @@ static const struct option long_options[] = { { "no-mip", no_argument, NULL, 0 }, { "lfnst", no_argument, NULL, 0 }, { "no-lfnst", no_argument, NULL, 0 }, + { "isp", no_argument, NULL, 0 }, + { "no-isp", no_argument, NULL, 0 }, { "jccr", no_argument, NULL, 0 }, { "no-jccr", no_argument, NULL, 0 }, { "amvr", no_argument, NULL, 0 }, @@ -191,8 +192,15 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, + { "mtt-depth-intra", required_argument, NULL, 0 }, + { "mtt-depth-inter", required_argument, NULL, 0 }, + { "mtt-depth-intra-chroma", required_argument, NULL, 0 }, + { "max-bt-size", required_argument, NULL, 0 }, + { "max-tt-size", required_argument, NULL, 0 }, { "intra-rough-granularity",required_argument, NULL, 0 }, { "ibc", required_argument, NULL, 0 }, + { "dep-quant", no_argument, NULL, 0 }, + { "no-dep-quant", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -571,6 +579,7 @@ void print_help(void) " - full: Full ALF\n" " --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n" " --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n" + " --(no-)dep-quant : Use dependent quantization. [disabled]\n" " --(no-)signhide : Sign hiding [disabled]\n" " --rd : Intra mode search complexity [0]\n" " - 0: Skip intra if inter is good enough.\n" @@ -602,14 +611,14 @@ void print_help(void) " - 2: + 1/2-pixel diagonal\n" " - 3: + 1/4-pixel horizontal and vertical\n" " - 4: + 1/4-pixel diagonal\n" - " --pu-depth-inter - : Inter prediction units sizes [0-3]\n" - " - 0, 1, 2, 3: from 64x64 to 8x8\n" + " --pu-depth-inter - : Maximum and minimum split depths where\n" + " inter search is performed 0..8. [0-3]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" " value for the respective layer).\n" - " --pu-depth-intra - : Intra prediction units sizes [1-4]\n" - " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" + " --pu-depth-intra - : Maximum and minimum split depths where\n" + " intra search is performed 0..8. [1-4]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" @@ -617,6 +626,22 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. 
[disabled]\n" + " --mtt-depth-intra : Depth of MTT for intra slices 0..3. [0]\n" + " --mtt-depth-intra-chroma : Depth of MTT for the chroma dual tree in\n" + " intra slices 0..3. [0]\n" + " --mtt-depth-inter : Depth of MTT for inter slices 0..3. [0]\n" + " All MTT modes are currently experimental and\n" + " require disabling some avx2 optimizations.\n" + " --max-bt-size : Maximum size for a CU resulting from\n" + " a BT split. A single value shared by all,\n" + " or a list of three values for the different\n" + " slice types (intra, inter, intra-chroma),\n" + " can be provided. [64, 64, 32]\n" + " --max-tt-size : Maximum size for a CU resulting from\n" + " a TT split. A single value shared by all,\n" + " or a list of three values for the different\n" + " slice types (intra, inter, intra-chroma),\n" + " can be provided. [64, 64, 32]\n" " --intra-rough-granularity : How many levels are used for the\n" " logarithmic intra rough search. 0..4\n" " With 0 all of the modes are checked \n" @@ -634,7 +659,6 @@ void print_help(void) " This is mostly for debugging and is not\n" " guaranteed to produce sensible bitstream or\n" " work at all. [disabled]\n" - " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" " - off: Don't terminate early.\n" @@ -671,6 +695,9 @@ void print_help(void) " --(no-)mip : Enable matrix weighted intra prediction.\n" " --(no-)lfnst : Enable low frequency non-separable transform.\n" " [disabled]\n" + " --(no-)isp : Enable intra sub-partitions. [disabled]\n" + " Experimental, requires disabling some avx2\n" + " optimizations.\n" " --mts : Multiple Transform Selection [off].\n" " (Currently only implemented for intra\n" " and has effect only when rd >= 2)\n" diff --git a/src/context.c b/src/context.c index 83bd5502..30861849 100644 --- a/src/context.c +++ b/src/context.c @@ -50,6 +50,21 @@ static const uint8_t INIT_QT_SPLIT_FLAG[4][6] = { { 0, 8, 8, 12, 12, 8, }, }; + +static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = { + { 43, 42, 37, 42, 44, }, + { 43, 35, 37, 34, 52, }, + { 43, 42, 29, 27, 44, }, + { 9, 8, 9, 8, 5, }, +}; + +static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = { + { 28, 29, 28, 29, }, + { 43, 37, 21, 22, }, + { 36, 45, 36, 45, }, + { 12, 13, 12, 13, }, + }; + static const uint8_t INIT_SKIP_FLAG[4][3] = { { 57, 60, 46, }, { 57, 59, 45, }, @@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice) uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]); uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]); uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]); + uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]); + } + + for (i = 0; i < 5; i++) { + uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]); } for (i = 0; i < 6; i++) { @@ -618,13 +638,14 @@ void uvg_context_copy(encoder_state_t * const target_state, const encoder_state_ uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, - int32_t width) + int32_t width, + int32_t height) { uint32_t uiRight = 0; uint32_t uiLower = 0; uint32_t position = pos_y * width + pos_x; if (pos_x + 1 < (uint32_t)width) uiRight = sig_coeff_group_flag[position + 1]; - if (pos_y +
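/* Reviewer note on the fix completed on the next line: with MTT the 4x4 coefficient-group grid can be rectangular, so the lower-neighbour test must bound pos_y by the grid height. Worked example: an 8x32 TU has a 2x8 group grid; for the group at pos_y == 3, the old test pos_y + 1 < width (4 < 2) wrongly rejected the lower neighbour, while square TUs never exposed the bug because width == height there. */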
1 < (uint32_t)width) uiLower = sig_coeff_group_flag[position + width]; + if (pos_y + 1 < (uint32_t)height) uiLower = sig_coeff_group_flag[position + width]; return uiRight || uiLower; } @@ -656,7 +677,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, * \returns context index for current scan position */ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t color, int32_t* temp_diag, int32_t* temp_sum) { const coeff_t* data = coeff + pos_x + pos_y * width; @@ -686,7 +707,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u } #undef UPDATE int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0); - if (type == 0 /* Luma */) + if (color == COLOR_Y) { ctx_ofs += diag < 5 ? 4 : 0; } @@ -814,7 +835,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos * \returns context go rice parameter */ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/ @@ -856,8 +877,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, * \returns context go rice parameter */ uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { - uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel); + uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel); return g_go_rice_pars[check]; } \ No newline at end of file diff --git a/src/context.h b/src/context.h index 366a438a..f083e44c 100644 --- a/src/context.h +++ b/src/context.h @@ -49,10 +49,10 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice); void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * source_state); -uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width); +uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width, int32_t height); uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width); uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t type, int32_t* temp_diag, int32_t* temp_sum); uint32_t uvg_context_get_sig_ctx_idx_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos_y, @@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, uint32_t height, uint32_t width, uint32_t baselevel); uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel); + uint32_t width, uint32_t height, uint32_t baselevel); #define CNU 35 #define DWS 8 diff --git a/src/cu.c b/src/cu.c index 40fce65e..d7c37108 100644 --- a/src/cu.c +++ b/src/cu.c @@ -34,6 +34,9 @@ #include #include "cu.h" + +#include "alf.h" +#include "encoderstate.h" #include "threads.h" @@ -97,6 +100,42 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px) } +void uvg_get_isp_cu_arr_coords(int *x, int *y, int dim) +{ + // Do 
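/* Context for this helper: cu_info_t entries live on a 4x4 grid, but ISP can create 1x16, 16x1, 2x8 and 8x2 transform blocks whose origins are not 4-aligned, so their data must be folded into unique 4x4 cells. Note that the early-out below tests the coordinates, not the block dimensions. One example of the mapping, as I read the code: a 16x1 partition at y % 4 == 1 is stored at (x + 4, y - 1). */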
nothing if dimensions are divisible by 4 + if (*y % 4 == 0 && *x % 4 == 0) return; + const int remainder_y = *y % 4; + const int remainder_x = *x % 4; + + if (remainder_y != 0) { + // Horizontal ISP split + if (remainder_y % 2 == 0 && dim == 8) { + // 8x2 block + *y -= 2; + *x += 4; + } + else { + // 16x1 block + *y -= remainder_y; + *x += remainder_y * 4; + } + } + else { + // Vertical ISP split + if (*x % 2 == 0 && dim == 8) { + // 2x8 block + *y += 4; + *x -= 2; + } + else { + // 1x16 block + *y += remainder_x * 4; + *x -= remainder_x; + } + } +} + + const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px) { assert(x_px < cua->width); @@ -237,10 +276,10 @@ cu_array_t * uvg_cu_array_copy_ref(cu_array_t* cua) * \param dst_y y-coordinate of the top edge of the copied area in dst * \param src source lcu */ -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type) +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src) { const int dst_stride = dst->stride >> 2; - const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C; + const int width = LCU_WIDTH; for (int y = 0; y < width; y += SCU_WIDTH) { for (int x = 0; x < width; x += SCU_WIDTH) { const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y); @@ -251,3 +290,215 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu } } } + +/* + * \brief Constructs cu_loc_t based on given parameters. Calculates chroma dimensions automatically. + * + * \param loc Destination cu_loc. + * \param x Block top left x coordinate. + * \param y Block top left y coordinate. + * \param width Block width. + * \param height Block height. +*/ +void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) +{ + assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions."); + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH)."); + // This check is no longer valid. With non-square blocks and ISP enabled, even 1x16 and 16x1 (ISP needs at least 16 samples) blocks are valid + //assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4."); + + loc->x = x; + loc->y = y; + loc->local_x = x % LCU_WIDTH; + loc->local_y = y % LCU_WIDTH; + loc->width = width; + loc->height = height; + // TODO: when MTT is implemented, chroma dimensions can be minimum 2. + // Chroma width is half of luma width, when not at maximum depth. 
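/* The chroma fields assume 4:2:0 subsampling, so both dimensions are simply halved; e.g. uvg_cu_loc_ctor(&loc, 0, 0, 32, 16) yields chroma_width == 16 and chroma_height == 8. local_x/local_y are the offsets inside the containing LCU (coordinates modulo LCU_WIDTH). */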
+ loc->chroma_width = width >> 1; + loc->chroma_height = height >> 1; +} + + +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4], + uint8_t* separate_chroma) +{ + const int half_width = origin->width >> 1; + const int half_height = origin->height >> 1; + const int quarter_width = origin->width >> 2; + const int quarter_height = origin->height >> 2; + if (origin->width == 4 && separate_chroma) *separate_chroma = 1; + + switch (split) { + case NO_SPLIT: + assert(0 && "trying to get split from no split"); + break; + case QT_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height); + uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height); + if (half_height == 4 && separate_chroma) *separate_chroma = 1; + return 4; + case BT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height); + if (half_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; + return 2; + case BT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); + if ((half_width == 4 || half_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; + return 2; + case TT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height); + if (quarter_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; + return 3; + case TT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height); + if ((quarter_width == 4 || quarter_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; + return 3; + } + return 0; +} + + +int uvg_get_implicit_split( + const encoder_state_t* const state, + const cu_loc_t* const cu_loc, + uint8_t max_mtt_depth) +{ + bool right_ok = (state->tile->frame->width) >= cu_loc->x + cu_loc->width; + bool bottom_ok = (state->tile->frame->height) >= cu_loc->y + cu_loc->height; + + if (right_ok && bottom_ok) return NO_SPLIT; + if (right_ok && max_mtt_depth != 0) return BT_HOR_SPLIT; + if (bottom_ok && max_mtt_depth != 0) return BT_VER_SPLIT; + return QT_SPLIT; +} + + +int uvg_get_possible_splits(const encoder_state_t * const state, + const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]) +{ + const unsigned width = cu_loc->width; + const unsigned height = cu_loc->height; + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
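A quick geometry check of uvg_get_split_locs above; a minimal sketch (assert usage illustrative only):

cu_loc_t cu, parts[4];
uvg_cu_loc_ctor(&cu, 0, 0, 32, 32);
// TT_VER on 32x32 -> 8x32 | 16x32 | 8x32 (quarter, half, quarter)
int n = uvg_get_split_locs(&cu, TT_VER_SPLIT, parts, NULL);
assert(n == 3 && parts[0].width == 8 && parts[1].width == 16 && parts[2].width == 8);
// BT_HOR on 32x32 -> two 32x16 halves
n = uvg_get_split_locs(&cu, BT_HOR_SPLIT, parts, NULL);
assert(n == 2 && parts[0].height == 16 && parts[1].y == 16);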
2 : 0) : 1; + + const unsigned max_btd = + state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth; + const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type]; + const unsigned min_bt_size = 1 << MIN_SIZE; + const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type]; + const unsigned min_tt_size = 1 << MIN_SIZE; + const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; + + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd); + + splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; + bool can_btt = split_tree.mtt_depth < max_btd; + + const enum split_type last_split = GET_SPLITDATA(&split_tree, split_tree.current_depth - 1); + const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT; + + // don't allow QT-splitting below a BT split + if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false; + if (width <= min_qt_size) splits[QT_SPLIT] = false; + + if (tree_type == UVG_CHROMA_T && width <= 8) splits[QT_SPLIT] = false; + + if (implicitSplit != NO_SPLIT) + { + splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false; + + splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size; + splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size; + if (tree_type == UVG_CHROMA_T && width <= 8) splits[BT_VER_SPLIT] = false; + if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true; + return 1; + } + + if ((last_split == TT_HOR_SPLIT || last_split == TT_VER_SPLIT) && split_tree.part_index == 1) + { + splits[BT_HOR_SPLIT] = parl_split != BT_HOR_SPLIT; + splits[BT_VER_SPLIT] = parl_split != BT_VER_SPLIT; + } + + if (can_btt && (width <= min_bt_size && height <= min_bt_size) + && ((width <= min_tt_size && height <= min_tt_size))) + { + can_btt = false; + } + if (can_btt && (width > max_bt_size || height > max_bt_size) + && ((width > max_tt_size || height > max_tt_size))) + { + can_btt = false; + } + + if (!can_btt) + { + splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false; + + return 0; + } + + if (width > max_bt_size || height > max_bt_size) + { + splits[BT_HOR_SPLIT] = splits[BT_VER_SPLIT] = false; + } + + // specific check for BT splits + if (height <= min_bt_size) splits[BT_HOR_SPLIT] = false; + if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64) splits[BT_HOR_SPLIT] = false; + + if (width <= min_bt_size) splits[BT_VER_SPLIT] = false; + if (width <= 64 && height > 64) splits[BT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 64 || width <= 8)) splits[BT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; + + if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) + splits[TT_HOR_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64 * 2) splits[TT_HOR_SPLIT] = false; + + if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) + splits[TT_VER_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && 
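/* Rationale for the parl_split rule above, mirroring the VTM restriction: a binary split of the middle ternary partition in the same direction would recreate a partitioning that is already reachable another way. E.g. TT_HOR on 32x32 gives 32x8 / 32x16 / 32x8; BT_HOR of the middle 32x16 lands on the same 8/8/8/8 row boundaries as BT_HOR followed by BT_HOR in both halves, so the redundant signalling path is forbidden. */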
(width * height <= 64 * 2 || width <= 16)) splits[TT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 64) splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false; + return 0; +} + + +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left) +{ + if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) { + return 0; + } + if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4; + if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2; + + int amount = left ? cu_loc->height & ~3 : cu_loc->width & ~3; + if(left) { + const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); + if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8; + while (cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH); + } + while (cu_loc->local_x + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); +} diff --git a/src/cu.h b/src/cu.h index ddddaf55..8f3ec8bf 100644 --- a/src/cu.h +++ b/src/cu.h @@ -77,55 +77,6 @@ typedef enum { MTS_TR_NUM = 6, } mts_idx; -extern const uint8_t uvg_part_mode_num_parts[]; -extern const uint8_t uvg_part_mode_offsets[][4][2]; -extern const uint8_t uvg_part_mode_sizes[][4][2]; - -/** - * \brief Get the x coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_x x coordinate of the containing CU - * \param i number of the PU - * \return location of the left edge of the PU - */ -#define PU_GET_X(part_mode, cu_width, cu_x, i) \ - ((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the y coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_y y coordinate of the containing CU - * \param i number of the PU - * \return location of the top edge of the PU - */ -#define PU_GET_Y(part_mode, cu_width, cu_y, i) \ - ((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4) - -/** - * \brief Get the width of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return width of the PU - */ -#define PU_GET_W(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the height of a PU. 
- * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return height of the PU - */ -#define PU_GET_H(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4) ////////////////////////////////////////////////////////////////////////// // TYPES @@ -142,24 +93,53 @@ enum uvg_tree_type { UVG_CHROMA_T = 2 }; +enum split_type { + NO_SPLIT = 0, + QT_SPLIT = 1, + BT_HOR_SPLIT = 2, + BT_VER_SPLIT = 3, + TT_HOR_SPLIT = 4, + TT_VER_SPLIT = 5, +}; + +typedef struct { + uint32_t split_tree; + uint8_t current_depth; + uint8_t mtt_depth; + uint8_t implicit_mtt_depth; + uint8_t part_index; +} split_tree_t; + + +// Each depth level takes three bits of the form xxy: if either x bit is set it is an MTT split, +// and once any MTT split is present a QT split is no longer allowed +#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0) + /** * \brief Struct for CU info */ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values - uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values - uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index uint8_t tr_skip : 3; //!< \brief transform skip flag uint8_t tr_idx : 3; //!< \brief transform index - uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding + uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding + + uint8_t log2_width : 3; + uint8_t log2_height : 3; + + uint8_t log2_chroma_width : 3; + uint8_t log2_chroma_height : 3; uint16_t cbf; + uint8_t root_cbf; + + uint32_t split_tree : 3 * 9; + /** * \brief QP used for the CU. * @@ -172,12 +152,15 @@ typedef struct uint8_t violates_mts_coeff_constraint : 1; uint8_t mts_last_scan_pos : 1; - uint8_t violates_lfnst_constrained_luma : 1; // Two types, luma and chroma. Luma index is 0. - uint8_t violates_lfnst_constrained_chroma : 1; // Two types, luma and chroma. Luma index is 0. + uint8_t violates_lfnst_constrained_luma : 1; + uint8_t violates_lfnst_constrained_chroma : 1; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; + uint8_t luma_deblocking : 2; + uint8_t chroma_deblocking : 2; + union { struct { int8_t mode; @@ -185,6 +168,9 @@ typedef struct uint8_t multi_ref_idx; int8_t mip_flag; int8_t mip_is_transposed; + int8_t isp_mode; + uint8_t isp_cbfs : 4; + uint8_t isp_index : 2; } intra; struct { mv_t mv[2][2]; // \brief Motion vectors for L0 and L1 @@ -200,12 +186,25 @@ typedef struct { int16_t x; int16_t y; + uint8_t local_x; + uint8_t local_y; int8_t width; int8_t height; int8_t chroma_width; int8_t chroma_height; } cu_loc_t; +void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); +typedef struct encoder_state_t encoder_state_t; + +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4], + uint8_t* separate_chroma); +int uvg_get_possible_splits(const encoder_state_t* const state, + const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]); + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ?
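To make the packed layout concrete: split_tree stores one 3-bit split_type per depth, nine levels in 27 bits, and GET_SPLITDATA further down reads one level back out. A hypothetical helper (not part of the patch) showing the encoding:

// Record the split chosen at 'depth' (0..8) in a packed tree.
static uint32_t split_tree_push(uint32_t tree, int depth, enum split_type split)
{
  return tree | ((uint32_t)split << (depth * 3)); // 3 bits per level
}

The mask 0x6DB6DB6 is binary 110 repeated nine times: it selects the two high bits of every 3-bit slot, which are zero only for NO_SPLIT (0) and QT_SPLIT (1), so CAN_QT_SPLIT(x) turns false as soon as any BT/TT split is recorded.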
(cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) @@ -219,7 +218,7 @@ typedef struct { } \ } while (0) -#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \ +#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d part_size=%d coded=%d " \ "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \ "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \ "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \ @@ -227,7 +226,7 @@ typedef struct { "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \ "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \ "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \ - , (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \ + , (cu).type, (cu).part_size, (cu).coded, \ (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \ (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \ (cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \ @@ -246,6 +245,7 @@ typedef struct cu_array_t { } cu_array_t; cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px); +void uvg_get_isp_cu_arr_coords(int* x, int* y, int dim); const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px); cu_array_t * uvg_cu_array_alloc(const int width, const int height); @@ -382,8 +382,9 @@ typedef struct { cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1]; } lcu_t; -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type - tree_type); +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src); + +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); /** * \brief Return pointer to the top right reference CU. @@ -412,9 +413,11 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu */ static INLINE void copy_coeffs(const coeff_t *__restrict src, coeff_t *__restrict dest, - size_t width) + size_t width, size_t height, const int lcu_width) { - memcpy(dest, src, width * width * sizeof(coeff_t)); + for (int j = 0; j < height; ++j) { + memcpy(dest + j * lcu_width, src + j * lcu_width, width * sizeof(coeff_t)); + } } @@ -554,56 +557,52 @@ static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y) } while(0) -#define NUM_CBF_DEPTHS 5 -static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 }; - /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane) +static INLINE int cbf_is_set(uint16_t cbf, color_t plane) { - return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0; + return (cbf & (1 << (plane))) != 0; } /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set_any(uint16_t cbf, int depth) +static INLINE int cbf_is_set_any(uint16_t cbf) { - return cbf_is_set(cbf, depth, COLOR_Y) || - cbf_is_set(cbf, depth, COLOR_U) || - cbf_is_set(cbf, depth, COLOR_V); + return cbf_is_set(cbf, COLOR_Y) || + cbf_is_set(cbf, COLOR_U) || + cbf_is_set(cbf, COLOR_V); } /** * Set CBF in a level to true. 
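With the transform-tree depth gone, the cbf word shrinks from five bits per plane to one bit per colour plane (COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2), as the rewritten helpers below show. A minimal sketch of the new layout:

uint16_t cbf = 0;
cbf_set(&cbf, COLOR_U);            // cbf == 0b010
assert( cbf_is_set(cbf, COLOR_U)); // plane bits are independent
assert(!cbf_is_set(cbf, COLOR_Y));
cbf_clear(&cbf, COLOR_U);          // cbf == 0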
*/ -static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_set(uint16_t *cbf, color_t plane) { // Return value of the bit corresponding to the level. - *cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane); + *cbf |= (1) << (plane); } /** * Set CBF in a level to true if it is set at a lower level in any of * the child_cbfs. */ -static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane) +static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], color_t plane) { - bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) || - cbf_is_set(child_cbfs[1], depth + 1, plane) || - cbf_is_set(child_cbfs[2], depth + 1, plane); + bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) || + cbf_is_set(child_cbfs[1], plane) || + cbf_is_set(child_cbfs[2], plane); if (child_cbf_set) { - cbf_set(cbf, depth, plane); + cbf_set(cbf, plane); } } /** - * Set CBF in a levels <= depth to false. */ -static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_clear(uint16_t *cbf, color_t plane) { - *cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane)); + *cbf &= ~(1 << (plane)); } /** @@ -611,11 +610,11 @@ static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) */ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) { - cbf_clear(cbf, 0, plane); - *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane)); + cbf_clear(cbf, plane); + *cbf |= src & (1 << plane); } -#define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth) -#define SET_SPLITDATA(CU,flag) { (CU)->split=(flag); } +#define GET_SPLITDATA(CU,curDepth) ((CU)->split_tree >> ((MAX((curDepth), 0) * 3)) & 7) +#define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE) #endif diff --git a/src/dep_quant.c b/src/dep_quant.c new file mode 100644 index 00000000..16591390 --- /dev/null +++ b/src/dep_quant.c @@ -0,0 +1,1139 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#include "dep_quant.h" + +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "uvg_math.h" +#include "generic/quant-generic.h" + +#include "strategies-depquant.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + + +int uvg_init_nb_info(encoder_control_t * encoder) { + memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); + memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); + memset(encoder->scan_info, 0, sizeof(encoder->scan_info)); + for (int hd = 0; hd <= 6; hd++) + { + + uint32_t raster2id[64 * 64] = {0}; + + for (int vd = 0; vd <= 6; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + const uint32_t blockWidth = (1 << hd); + const uint32_t blockHeight = (1 << vd); + const uint32_t log2CGWidth = g_log2_sbb_size[hd][vd][0]; + const uint32_t log2CGHeight = g_log2_sbb_size[hd][vd][1]; + const uint32_t groupWidth = 1 << log2CGWidth; + const uint32_t groupHeight = 1 << log2CGHeight; + const uint32_t groupSize = groupWidth * groupHeight; + const int scanType = SCAN_DIAG; + const uint32_t blkWidthIdx = hd; + const uint32_t blkHeightIdx = vd; + const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, 0, hd, vd); + NbInfoSbb** sId2NbSbb = &encoder->m_scanId2NbInfoSbbArray[hd][vd]; + NbInfoOut** sId2NbOut = &encoder->m_scanId2NbInfoOutArray[hd][vd]; + // consider only non-zero-out region + const uint32_t blkWidthNZOut =
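/* The MIN(32, ...) clamp completing this line reflects VVC's high-frequency zero-out: transform blocks larger than 32 in either dimension can only carry non-zero coefficients in their top-left 32x32 region, so neighbour tables are needed for at most 32*32 = 1024 scan positions even for a 64x64 block. */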
MIN(32, blockWidth); + const uint32_t blkHeightNZOut = MIN(32, blockHeight); + const uint32_t totalValues = blkWidthNZOut * blkHeightNZOut; + + *sId2NbSbb = MALLOC(NbInfoSbb, totalValues); + if (*sId2NbSbb == NULL) { + return 0; + } + *sId2NbOut = MALLOC(NbInfoOut, totalValues); + if (*sId2NbOut == NULL) { + return 0; + } + encoder->scan_info[hd][vd] = MALLOC(struct dep_quant_scan_info, totalValues); + if (encoder->scan_info[hd][vd] == NULL) { + return 0; + } + + + for (uint32_t scanId = 0; scanId < totalValues; scanId++) + { + raster2id[scanId2RP[scanId]] = scanId; + } + const uint32_t height_in_sbb = MAX(blockHeight >> 2, 1); + const uint32_t width_in_sbb = MAX(blockWidth >> 2, 1); + + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + const int rpos = scanId2RP[scanId]; + uint32_t pos_y = rpos >> hd; + uint32_t pos_x = rpos - (pos_y << hd); // TODO: height + { + //===== inside subband neighbours ===== + NbInfoSbb *nbSbb = &(*sId2NbSbb)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? (raster2id[rpos + 1] < groupSize + begSbb ? raster2id[rpos + 1] - begSbb : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] < groupSize + begSbb ? raster2id[rpos + 2] - begSbb : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] < groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] - begSbb : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] < groupSize + begSbb ? raster2id[rpos + blockWidth] - begSbb : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] < groupSize + begSbb ? raster2id[rpos + 2 * blockWidth] - begSbb : 0) : 0); + + for (nbSbb->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbSbb->inPos[nbSbb->num++] = (uint8_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbSbb->num; k < 5; k++) + { + nbSbb->inPos[k] = 0; + } + } + { + //===== outside subband neighbours ===== + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? (raster2id[rpos + 1] >= groupSize + begSbb ? raster2id[rpos + 1] : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] >= groupSize + begSbb ? raster2id[rpos + 2] : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] >= groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] >= groupSize + begSbb ? raster2id[rpos + blockWidth] : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] >= groupSize + begSbb ? raster2id[rpos + 2 * blockWidth] : 0) : 0); + + for (nbOut->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbOut->outPos[nbOut->num++] = (uint16_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbOut->num; k < 5; k++) + { + nbOut->outPos[k] = 0; + } + nbOut->maxDist = (scanId == 0 ? 
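/* The cpos[0..4] entries built above form the five-position template that the VVC-style context modelling sums over, relative to the current coefficient X:

     X  0  1
     3  2
     4

   i.e. (x+1,y), (x+2,y), (x+1,y+1), (x,y+1) and (x,y+2); the 'inside' variant keeps neighbours within the current 4x4 subblock and the 'outside' variant keeps those beyond it. */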
0 : (*sId2NbOut)[scanId - 1].maxDist); + for (int k = 0; k < nbOut->num; k++) + { + if (nbOut->outPos[k] > nbOut->maxDist) + { + nbOut->maxDist = nbOut->outPos[k]; + } + } + } + uint32_t cg_pos = cg_scan[scanId >> 4]; + + uint32_t blkpos_next = scanId2RP[scanId ? scanId - 1 : 0]; + uint32_t pos_y_next = blkpos_next >> hd; + uint32_t pos_x_next = blkpos_next - (pos_y_next << hd); + uint32_t cg_blockpos_next = scanId ? cg_scan[(scanId - 1) >> 4] : 0; + uint32_t cg_pos_y_next = cg_blockpos_next / width_in_sbb; + uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * width_in_sbb); + uint32_t diag = pos_y_next + pos_x_next; + + + uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); + uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? cg_blockpos_next + width_in_sbb : 0); + encoder->scan_info[hd][vd][scanId].pos_x = pos_x; + encoder->scan_info[hd][vd][scanId].pos_y = pos_y; + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[0] = (diag < 2 ? 8 : diag < 5 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[1] = (diag < 2 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[0] = (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[1] = (diag < 1 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].cg_pos = cg_pos; + encoder->scan_info[hd][vd][scanId].next_sbb_right = nextSbbRight; + encoder->scan_info[hd][vd][scanId].next_sbb_below = nextSbbBelow; + } + + // make it relative + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + for (int k = 0; k < nbOut->num; k++) + { + nbOut->outPos[k] -= begSbb; + } + nbOut->maxDist -= scanId; + } + } + } + return 1; +} + +void uvg_dealloc_nb_info(encoder_control_t* encoder) { + + for (int hd = 0; hd <= 7; hd++) { + for (int vd = 0; vd <= 7; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoOutArray[hd][vd]); + if(encoder->m_scanId2NbInfoSbbArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoSbbArray[hd][vd]); + if(encoder->scan_info[hd][vd]) FREE_POINTER(encoder->scan_info[hd][vd]); + } + } +} + + +static INLINE int ceil_log2(uint64_t x) +{ + static const uint64_t t[6] = { 0xFFFFFFFF00000000ull, 0x00000000FFFF0000ull, 0x000000000000FF00ull, 0x00000000000000F0ull, 0x000000000000000Cull, 0x0000000000000002ull }; + int y = (((x & (x - 1)) == 0) ? 0 : 1); + int j = 32; + for (int i = 0; i < 6; i++) + { + int k = (((x & t[i]) == 0) ? 0 : j); + y += k; + x >>= k; + j >>= 1; + } + return y; +} + +static void init_quant_block( + const encoder_state_t* state, + quant_block* qp, + const cu_info_t* const cur_tu, + unsigned log2_width, + unsigned log2_height, + color_t color, + const bool needsSqrt2ScaleAdjustment, + const int gValue) +{ + double lambda = color == COLOR_Y ? state->lambda : state->c_lambda; + + const int qpDQ = state->qp + 1; + const int qpPer = qpDQ / 6; + const int qpRem = qpDQ - 6 * qpPer; + const int channelBitDepth = state->encoder_control->bitdepth; + const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE; + const int nomTransformShift = MAX_TR_DYNAMIC_RANGE - channelBitDepth - ((log2_width + log2_height) >> 1); + const bool clipTransformShift = (cur_tu->tr_skip >> color) & 1 && false; // extended precision + const int transformShift = + (clipTransformShift ?
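/* A worked instance of the shift derivation continuing here, assuming MAX_TR_DYNAMIC_RANGE == 15 and 10-bit content: a 32x16 TB gives nomTransformShift = 15 - 10 - ((5 + 4) >> 1) = 1; since log2_width + log2_height is odd, the block needs the sqrt(2) scale adjustment, which is why transformShift takes the extra -1 term and m_QScale is read from the second row of uvg_g_quant_scales. */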
MAX(0, nomTransformShift) : + nomTransformShift) + + (needsSqrt2ScaleAdjustment ? -1 : 0); + // quant parameters + qp->m_QShift = QUANT_SHIFT - 1 + qpPer + transformShift; + qp->m_QAdd = -((3 << qp->m_QShift) >> 1); + int invShift = IQUANT_SHIFT + 1 - qpPer - transformShift; + qp->m_QScale = uvg_g_quant_scales[needsSqrt2ScaleAdjustment ? 1 : 0][qpRem]; + const unsigned qIdxBD = MIN( + maxLog2TrDynamicRange + 1, + 8 * sizeof(int) + invShift - IQUANT_SHIFT - 1); + qp->m_maxQIdx = (1 << (qIdxBD - 1)) - 4; + qp->m_thresLast = (((int64_t)(4) << (int64_t)qp->m_QShift)); + qp->m_thresSSbb = (((int64_t)(3) << (int64_t)qp->m_QShift)); + // distortion calculation parameters + const int64_t qScale = (gValue == -1) ? qp->m_QScale : gValue; + const int nomDShift = + 15 - + 2 * (nomTransformShift) + + qp->m_QShift + (needsSqrt2ScaleAdjustment ? 1 : 0); + const double qScale2 = (double)(qScale * qScale); + const double nomDistFactor = + (nomDShift < 0 ? + 1.0 / ((double)((int64_t)(1) << (-nomDShift)) * qScale2 * lambda) : + (double)((int64_t)(1) << nomDShift) / (qScale2 * lambda)); + const int64_t pow2dfShift = (int64_t)(nomDistFactor * qScale2) + 1; + const int dfShift = ceil_log2(pow2dfShift); + qp->m_DistShift = 62 + qp->m_QShift - 2 * maxLog2TrDynamicRange - dfShift; + qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; + qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); + qp->m_DistOrgFact = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + 1)) + .5); + qp->needs_init = false; +} + +static void reset_common_context(common_context* ctx, const rate_estimator_t * rate_estimator, int numSbb, int num_coeff) +{ + //memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); + memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); + uint8_t* next_sbb_memory = ctx->sbb_memory; + uint8_t* next_level_memory = ctx->level_memory; + for (int k = 0; k < 2; k++, next_sbb_memory += numSbb * 4llu, next_level_memory += num_coeff * 4llu) { + ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory; + ctx->m_allSbbCtx[k].levels = next_level_memory; + } + ctx->m_curr_sbb_ctx_offset = 0; + ctx->m_prev_sbb_ctx_offset = 1; + ctx->num_coeff = num_coeff; +} + +static void init_rate_esimator(rate_estimator_t * rate_estimator, const cabac_data_t * const ctx, color_t color) +{ + const cabac_ctx_t * base_ctx = color == COLOR_Y ? ctx->ctx.sig_coeff_group_model : (ctx->ctx.sig_coeff_group_model + 2); + for (unsigned ctxId = 0; ctxId < SM_MAX_NUM_SIG_SBB_CTX; ctxId++) { + rate_estimator->m_sigSbbFracBits[ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigSbbFracBits[ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + unsigned numCtx = (color == COLOR_Y ? 12 : 8); + for (unsigned ctxSetId = 0; ctxSetId < SM_NUM_CTX_SETS_SIG; ctxSetId++) { + base_ctx = color == COLOR_Y ? ctx->ctx.cu_sig_model_luma[ctxSetId] : ctx->ctx.cu_sig_model_chroma[ctxSetId]; + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + rate_estimator->m_sigFracBits[ctxSetId][ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigFracBits[ctxSetId][ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + } + + numCtx = (color == COLOR_Y? 21 : 11); + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + const cabac_ctx_t * par_ctx = color == COLOR_Y ? &ctx->ctx.cu_parity_flag_model_luma[ctxId] : &ctx->ctx.cu_parity_flag_model_chroma[ctxId]; + const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? 
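/* The cb[] table filled just below caches the first-pass context-coded bits per absolute level: cb[1] codes level 1 (gt1 = 0), cb[2] level 2 (gt1 = 1, parity 0, gt2 = 0), cb[3] level 3 (parity 1), and cb[4]/cb[5] levels >= 4 (gt2 = 1), whose remainders then go to Golomb-Rice coding via g_goRiceBits. The flat (1 << SCALE_BITS) term appears to account for one bypass bin (the sign) per non-zero level. */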
&ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; + const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; + + int32_t* cb = rate_estimator->m_gtxFracBits[ctxId]; + int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); + int32_t par1 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 1); + cb[0] = 0; + cb[1] = CTX_ENTROPY_BITS(gt1_ctx, 0) + (1 << SCALE_BITS); + cb[2] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[3] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[4] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 1); + cb[5] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 1); + } +} + + +static void xSetLastCoeffOffset( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + rate_estimator_t* rate_estimator, + const color_t compID) +{ + int32_t cbfDeltaBits = 0; + if (compID == COLOR_Y && cur_tu->type != CU_INTRA /*&& !tu.depth*/) { + cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 1) - (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 0); + } else { + bool prevLumaCbf = false; + bool lastCbfIsInferred = false; + bool useIntraSubPartitions = cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode && compID == COLOR_Y; + if (useIntraSubPartitions) { + uint32_t nTus = uvg_get_isp_split_num(1 << cur_tu->log2_width, 1 << cur_tu->log2_height, cur_tu->intra.isp_mode, true); + bool isLastSubPartition = cur_tu->intra.isp_index +1 == nTus; //TODO: isp check + if (isLastSubPartition) { + lastCbfIsInferred = cur_tu->intra.isp_cbfs == 0; + } + if (!lastCbfIsInferred) { + prevLumaCbf = cur_tu->intra.isp_index != 0 && (cur_tu->intra.isp_cbfs & (1 << (cur_tu->intra.isp_index - 1))); + } + const cabac_ctx_t * const cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[2 + prevLumaCbf]; + cbfDeltaBits = lastCbfIsInferred ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } + else { + const cabac_ctx_t* cbf_ctx; + switch (compID) { + case COLOR_Y: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[0]; + break; + case COLOR_U: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cb[0]; + break; + case COLOR_V: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cbf_is_set(cur_tu->cbf, COLOR_U)]; + break; + } + cbfDeltaBits = compID != COLOR_Y && cur_tu->joint_cb_cr ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } + + } + + static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; + uint32_t ctxBits[14]; + for (unsigned xy = 0; xy < 2; xy++) { + int32_t bitOffset = (xy ? cbfDeltaBits : 0); + int32_t* lastBits = (xy ? rate_estimator->m_lastBitsY : rate_estimator->m_lastBitsX); + const unsigned size = (xy ? (height) : (width)); + const unsigned log2Size = uvg_math_ceil_log2(size); + const bool useYCtx = (xy != 0); + const cabac_ctx_t* const ctxSetLast = useYCtx ? + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_y_luma : state->search_cabac.ctx.cu_ctx_last_y_chroma) : + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_x_luma : state->search_cabac.ctx.cu_ctx_last_x_chroma); + const unsigned lastShift = (compID == COLOR_Y ? (log2Size + 1) >> 2 : CLIP(0, 2, size >> 3)); + const unsigned lastOffset = (compID == COLOR_Y ? 
(prefixCtx[log2Size]) : 0); + uint32_t sumFBits = 0; + unsigned maxCtxId = g_group_idx[MIN(32, size) - 1]; + for (unsigned ctxId = 0; ctxId < maxCtxId; ctxId++) { + ctxBits[ctxId] = sumFBits + + CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 0) + + (ctxId > 3 ? ((ctxId - 2) >> 1) << SCALE_BITS : 0) + + bitOffset; + sumFBits += CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 1); + } + ctxBits[maxCtxId] = sumFBits + (maxCtxId > 3 ? ((maxCtxId - 2) >> 1) << SCALE_BITS : 0) + bitOffset; + for (unsigned pos = 0; pos < MIN(32, size); pos++) { + lastBits[pos] = ctxBits[g_group_idx[pos]]; + } + } +} + + +static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2], uint32_t gtx_frac_bits[6]) +{ + state->m_rdCost = INT64_MAX >> 1; + state->m_numSigSbb = 0; + state->m_remRegBins = 4; // just large enough for last scan pos + state->m_refSbbCtxId = -1; + state->m_sigFracBits[0] = sig_frac_bits[0]; + state->m_sigFracBits[1] = sig_frac_bits[1]; + memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(state->m_coeffFracBits)); + state->m_goRicePar = 0; + state->m_goRiceZero = 0; + + state->m_sbbFracBits[0] = 0; + state->m_sbbFracBits[1] = 0; +} + + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states * const state, + const enum ScanPosType spt, + const PQData * pqDataA, + Decision * decisions, + const int decisionA, + const int decisionB, + const int state_offset) +{ + const int pqA = decisionA && decisionB ? 3 : 0; + const int pqB = decisionA && decisionB ? 1 : 2; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } + else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } + else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } + else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } + else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } + else { + rdCostZ = decisions->rdCost[decisionA]; + } + } + else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqA] - 1 + : (pqDataA->absLevel[pqA] < RICEMAX ? 
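For orientation in uvg_dep_quant_check_rd_costs above: dependent quantization interleaves two scalar quantizers and walks a four-state trellis driven by the parity of each level, which is why two competing quantization indices (pqA/pqB) are evaluated per decision slot. A sketch of the standard VVC state machine, transcribed from the spec rather than from this patch:

// next_state[current][level & 1]; states 0-1 use quantizer Q0, states 2-3 use Q1.
static const uint8_t next_state[4][2] = {
  { 0, 2 },
  { 2, 0 },
  { 1, 3 },
  { 3, 1 },
};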
pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqB] - 1 + : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + if (rdCostA < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostA; + decisions->absLevel[decisionA] = pqDataA->absLevel[pqA]; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostZ < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostZ; + decisions->absLevel[decisionA] = 0; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostB < decisions->rdCost[decisionB]) { + decisions->rdCost[decisionB] = rdCostB; + decisions->absLevel[decisionB] = pqDataA->absLevel[pqB]; + decisions->prevId[decisionB] = state->m_stateId[state_offset]; + } +} + + +static INLINE unsigned templateAbsCompare(coeff_t sum) +{ + int rangeIdx = 0; + if (sum < g_riceT[0]) { + rangeIdx = 0; + } + else if (sum < g_riceT[1]) { + rangeIdx = 1; + } + else if (sum < g_riceT[2]) { + rangeIdx = 2; + } + else if (sum < g_riceT[3]) { + rangeIdx = 3; + } + else { + rangeIdx = 4; + } + return g_riceShift[rangeIdx]; +} + +static INLINE void update_common_context( + context_store* ctxs, + common_context * cc, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const int prev_state, + const int curr_state) +{ + const uint32_t numSbb = width_in_sbb * height_in_sbb; + const int curr_state_without_offset = curr_state & 3; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + int8_t prev_sbb_state = ctxs->m_allStates.m_refSbbCtxId[prev_state]; + if (prev_state != -1 && prev_sbb_state >= 0) { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state]; + } + } + else { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state_without_offset] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = ctxs->m_allStates.m_absLevels[curr_state / 4][i * 4 + curr_state_without_offset]; + } + + const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right * 4 + curr_state_without_offset] : false) + || (next_sbb_below ? sbbFlags[next_sbb_below* 4 + curr_state_without_offset] : false) ? 1 : 0); + ctxs->m_allStates.m_numSigSbb[curr_state] = 0; + if (prev_state != -1) { + ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state]; + } + else { + int ctxBinSampleRatio = 28; + // (scanInfo.chType == COLOR_Y) ? 
MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + ctxs->m_allStates.m_remRegBins[curr_state] = (ctxs->m_allStates.effWidth * ctxs->m_allStates.effHeight * ctxBinSampleRatio) / 16; + } + ctxs->m_allStates.m_goRicePar[curr_state] = 0; + ctxs->m_allStates.m_refSbbCtxId[curr_state] = curr_state_without_offset; + ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; + ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; + + uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2]; + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = levels + scanBeg * 4; + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num) { + coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; +#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k] * 4 + curr_state_without_offset]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + UPDATE(0); + if (nbOut->num > 1) { + UPDATE(1); + if (nbOut->num > 2) { + UPDATE(2); + if (nbOut->num > 3) { + UPDATE(3); + if (nbOut->num > 4) { + UPDATE(4); + } + } + } + } +#undef UPDATE + templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1 << 3)) + (uint16_t)(MIN(127, sumAbs) << 8); + } + else { + templateCtxInit[curr_state_without_offset + id * 4] = 0; + } + } + for (int i = curr_state_without_offset; i < 64; i += 4) { + ctxs->m_allStates.m_absLevels[curr_state >> 2][i] = 0; + } +} + + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision * decisions, + int decision_id) +{ + all_depquant_states* state = &ctxs->m_allStates; + int curr_state_offset = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[curr_state_offset] = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { + int prvState = -1; + if (decisions->prevId[decision_id] >= 4) { + prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } + } + else if (decisions->prevId[decision_id] >= 0) { + prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i + decision_id] = + state->m_absLevels[ctxs->m_prev_state_offset / 4][i + decisions->prevId[decision_id]]; + } + } + else { + state->m_numSigSbb[curr_state_offset] = 1; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } + } + uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id]; + *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + + update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, + next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); + + coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; + coeff_t sumNum = tinit & 7; + coeff_t 
sumAbs1 = (tinit >> 3) & 31; + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[curr_state_offset][0] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[curr_state_offset][1] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + + memcpy(state->m_coeffFracBits[curr_state_offset], + state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + } +} + + +void uvg_dep_quant_update_state( + context_store * ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id) { + all_depquant_states* state = &ctxs->m_allStates; + int state_id = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + int32_t prev_id_no_offset = decisions->prevId[decision_id]; + if (prev_id_no_offset > -2) { + if (prev_id_no_offset >= 0) { + const int prvState = ctxs->m_prev_state_offset + prev_id_no_offset; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 4) { + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 + ? (unsigned)decisions->absLevel[decision_id] + : 3); + } + for (int i = 0; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } + } + else { + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; + int ctxBinSampleRatio = 28; + //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( + decisions->absLevel[decision_id] < 2 ? 
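+      // First coefficient of the block: start from the full budget of 28 context-coded
+      // bins per 16 samples and charge what this level consumes (none for a level of 0,
+      // one for 1, three for 2 or more).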
(unsigned)decisions->absLevel[decision_id] : 3); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][i] = 0; + } + for (int i = decision_id; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][i] = 0; + } + } + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; + uint8_t* levels = state->m_absLevels[ctxs->m_curr_state_offset >> 2]; + levels[(scan_pos & 15) * 4 + decision_id] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); + + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], + sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] += currentShift; + } + else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } + else { + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] += currentShift; + } + else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
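+      // In bypass mode a zero level is signaled with this codeword: position 1 for
+      // quantizer states 0-1, position 2 for states 2-3, scaled by the Rice parameter.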
1 : 2) << state->m_goRicePar[state_id]; + } + } + else { + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; + } +} + + +int uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + const coeff_t* srcCoeff, + coeff_t* coeff_out, + const color_t compID, + enum uvg_tree_type tree_type, + int* absSum, + const bool enableScalingLists) +{ + const encoder_control_t* const encoder = state->encoder_control; + //===== reset / pre-init ===== + const int baseLevel = 4; + context_store dep_quant_context; + dep_quant_context.m_curr_state_offset = 0; + dep_quant_context.m_prev_state_offset = 4; + dep_quant_context.m_skip_state_offset = 8; + + const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? + cur_tu->lfnst_idx : + cur_tu->cr_lfnst_idx; + + const int numCoeff = width * height; + + memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); + *absSum = 0; + + const bool is_mts = compID == COLOR_Y && cur_tu->tr_idx > MTS_SKIP; + const bool is_ts = (cur_tu->tr_skip >> compID) & 1; + + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height); + + int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; + bool needs_block_size_trafo_scale = !is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; + const int32_t *q_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; + + if (compID != COLOR_Y) { + dep_quant_context.m_quant = (quant_block*)& state->quant_blocks[2]; + } else if (cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP) { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[1]; + } else { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[0]; + } + //TODO: no idea when it is safe not to reinit for inter + if (dep_quant_context.m_quant->needs_init || cur_tu->type == CU_INTER) { + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + } + + //===== scaling matrix ==== + //const int qpDQ = cQP.Qp + 1; + //const int qpPer = qpDQ / 6; + //const int qpRem = qpDQ - 6 * qpPer; + + //TCoeff thresTmp = thres; + bool zeroOut = false; + bool zeroOutforThres = false; + int effWidth = width, effHeight = height; + if ( + (is_mts || + (state->encoder_control->cfg.mts && 0 /*sbt used by block*/ && + height <= 32 && width <= 32)) && + compID == COLOR_Y) { + effHeight = (height == 32) ? 16 : height; + effWidth = (width == 32) ? 16 : width; + zeroOut = (effHeight < height || effWidth < width); + } + zeroOutforThres = zeroOut || (32 < height || 32 < width); + //===== find first test position ===== + int firstTestPos = numCoeff - 1; + if ( + lfnstIdx > 0 && !is_ts && width >= 4 && + height >= 4) { + firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 
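+    // LFNST zeroes out all but the first 8 coefficients for 4x4 and 8x8 blocks,
+    // and all but the first 16 otherwise, so the search can start from there.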
7 : 15; + } + uvg_find_first_non_zero_coeff( + srcCoeff, + enableScalingLists, + &dep_quant_context, + scan, + q_coeff, + &firstTestPos, + width, + height); + if (firstTestPos < 0) { + return 0; + } + + //===== real init ===== + rate_estimator_t* rate_estimator = (rate_estimator_t *)(compID == COLOR_Y && cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP ? + &state->rate_estimator[3] : &state->rate_estimator[compID]); + if(rate_estimator->needs_init || cur_tu->type == CU_INTER) { + init_rate_esimator(rate_estimator, &state->search_cabac, compID); + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); + rate_estimator->needs_init = false; + } else if (compID == COLOR_U && state->encoder_control->cfg.jccr) { + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); + } + + reset_common_context(&dep_quant_context.m_common_context, rate_estimator, (width * height) >> 4, numCoeff); + dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; + + + int effectHeight = MIN(32, effHeight); + int effectWidth = MIN(32, effWidth); + for (int k = 0; k < 12; k++) { + dep_quant_context.m_allStates.m_rdCost[k] = INT64_MAX >> 1; + dep_quant_context.m_allStates.m_numSigSbb[k] = 0; + dep_quant_context.m_allStates.m_remRegBins[k] = 4; // just large enough for last scan pos + dep_quant_context.m_allStates.m_refSbbCtxId[k] = -1; + dep_quant_context.m_allStates.m_sigFracBits[k][0] = rate_estimator->m_sigFracBits[0][0][0]; + dep_quant_context.m_allStates.m_sigFracBits[k][1] = rate_estimator->m_sigFracBits[0][0][1]; + memcpy(dep_quant_context.m_allStates.m_coeffFracBits[k], rate_estimator->m_gtxFracBits[0], sizeof(dep_quant_context.m_allStates.m_coeffFracBits[k])); + dep_quant_context.m_allStates.m_goRicePar[k] = 0; + dep_quant_context.m_allStates.m_goRiceZero[k] = 0; + + dep_quant_context.m_allStates.m_sbbFracBits[k][0] = 0; + dep_quant_context.m_allStates.m_sbbFracBits[k][1] = 0; + + dep_quant_context.m_allStates.m_stateId[k] = k & 3; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + memcpy(dep_quant_context.m_allStates.m_sigFracBitsArray[k][i], rate_estimator->m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i], sizeof(uint32_t) * 2); + } + } + + dep_quant_context.m_allStates.effHeight = effectHeight; + dep_quant_context.m_allStates.effWidth = effectWidth; + dep_quant_context.m_allStates.all_gte_four = true; + dep_quant_context.m_allStates.all_lt_four = false; + dep_quant_context.m_allStates.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + memcpy(dep_quant_context.m_allStates.m_gtxFracBitsArray[i], rate_estimator->m_gtxFracBits[i], sizeof(int32_t) * 6); + } + + depquant_state_init(&dep_quant_context.m_startState, rate_estimator->m_sigFracBits[0][0], rate_estimator->m_gtxFracBits[0]); + dep_quant_context.m_startState.effHeight = effectHeight; + dep_quant_context.m_startState.effWidth = effectWidth; + dep_quant_context.m_startState.m_stateId = 0; + dep_quant_context.m_startState.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator->m_sigFracBits[0][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 
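+  // luma uses 21 greater-than-x context sets, chroma 11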
21 : 11); ++i) { + dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator->m_gtxFracBits[i]; + } + + const uint32_t height_in_sbb = MAX(height >> 2, 1); + const uint32_t width_in_sbb = MAX(width >> 2, 1); + + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + //===== populate trellis ===== + for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { + uint32_t blkpos = scan[scanIdx]; + struct dep_quant_scan_info* scan_info = &encoder->scan_info[log2_tr_width][log2_tr_height][scanIdx]; + + context_store* ctxs = &dep_quant_context; + if (enableScalingLists) { + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); + + uvg_dep_quant_decide_and_update( + rate_estimator, + ctxs, + scan_info, + abs(srcCoeff[blkpos]), + scanIdx, + width_in_sbb, + height_in_sbb, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), + q_coeff[blkpos], + width, + height, + compID != 0 + ); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } + else { + uvg_dep_quant_decide_and_update( + rate_estimator, + ctxs, + scan_info, + abs(srcCoeff[blkpos]), + scanIdx, + width_in_sbb, + height_in_sbb, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), + default_quant_coeff, + width, + height, + compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } + } + + //===== find best path ===== + int prev_id = -1; + int64_t minPathCost = 0; + for (int8_t stateId = 0; stateId < 4; stateId++) { + int64_t pathCost = dep_quant_context.m_trellis[0].rdCost[stateId]; + if (pathCost < minPathCost) { + prev_id = stateId; + minPathCost = pathCost; + } + } + + //===== backward scanning ===== + int scanIdx = 0; + context_store* ctxs = &dep_quant_context; + for (; prev_id >= 0; scanIdx++) { + Decision temp = dep_quant_context.m_trellis[scanIdx]; + int32_t blkpos = scan[scanIdx]; + coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
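+    // the trellis tracks absolute levels only, so re-apply the sign of the input coefficient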
-temp.absLevel[prev_id] : temp.absLevel[prev_id]);
+    *absSum += temp.absLevel[prev_id];
+    prev_id = temp.prevId[prev_id];
+  }
+  return *absSum;
+}
+
+
+void uvg_dep_quant_dequant(
+  const encoder_state_t* const state,
+  const int block_type,
+  const int width,
+  const int height,
+  const color_t compID,
+  coeff_t* quant_coeff,
+  coeff_t* coeff,
+  bool enableScalingLists)
+{
+  const encoder_control_t* const encoder = state->encoder_control;
+
+  const int numCoeff = width * height;
+
+  const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, 0, log2_tr_width, log2_tr_height);
+  bool needs_block_size_trafo_scale = ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size
+
+  //----- reset coefficients and get last scan index -----
+  memset(coeff, 0, numCoeff * sizeof(coeff_t));
+  int lastScanIdx = -1;
+  for (int scanIdx = numCoeff - 1; scanIdx >= 0; scanIdx--)
+  {
+    if (quant_coeff[scan[scanIdx]])
+    {
+      lastScanIdx = scanIdx;
+      break;
+    }
+  }
+  if (lastScanIdx < 0)
+  {
+    return;
+  }
+
+  //----- set dequant parameters -----
+  const int qpDQ = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]) + 1;
+  const int qpPer = qpDQ / 6;
+  const int qpRem = qpDQ - 6 * qpPer;
+  const int channelBitDepth = encoder->bitdepth;
+  const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE;
+  const coeff_t minTCoeff = -(1 << maxLog2TrDynamicRange);
+  const coeff_t maxTCoeff = (1 << maxLog2TrDynamicRange) - 1;
+  const int transformShift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale;
+  int shift = IQUANT_SHIFT + 1 - qpPer - transformShift + (enableScalingLists ? 4 : 0);
+  int invQScale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale ? 1 : 0][qpRem];
+  int add = (shift < 0) ? 0 : ((1 << shift) >> 1);
+  int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(compID);
+
+  const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qpDQ % 6];
+  //----- dequant coefficients -----
+  for (int state = 0, scanIdx = lastScanIdx; scanIdx >= 0; scanIdx--)
+  {
+    const unsigned rasterPos = scan[scanIdx];
+    const coeff_t level = quant_coeff[rasterPos];
+    if (level)
+    {
+      if (enableScalingLists)
+      {
+        invQScale = dequant_coef[rasterPos]; // scaling factor * levelScale
+      }
+      if (shift < 0 && (enableScalingLists || scanIdx == lastScanIdx))
+      {
+        invQScale <<= -shift;
+      }
+      int qIdx = (level << 1) + (level > 0 ? -(state >> 1) : (state >> 1));
+      int64_t nomTCoeff = ((int64_t)qIdx * (int64_t)invQScale + add) >> ((shift < 0) ? 0 : shift);
+      coeff[rasterPos] = (coeff_t)CLIP(minTCoeff, maxTCoeff, nomTCoeff);
+    }
+    state = (32040 >> ((state << 2) + ((level & 1) << 1))) & 3; // the 16-bit value 32040 packs the 2-bit state transition table, indexed by (state, level parity)
+  }
+}
diff --git a/src/dep_quant.h b/src/dep_quant.h
new file mode 100644
index 00000000..6ef54f4d
--- /dev/null
+++ b/src/dep_quant.h
@@ -0,0 +1,247 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#ifndef DEP_QUANT_H_
+#define DEP_QUANT_H_
+
+#include "cu.h"
+#include "global.h"
+
+#define SM_NUM_CTX_SETS_SIG 3
+#define SM_NUM_CTX_SETS_GTX 2
+#define SM_MAX_NUM_SIG_SBB_CTX 2
+#define SM_MAX_NUM_SIG_CTX 12
+#define SM_MAX_NUM_GTX_CTX 21
+#define SCALE_BITS 15
+#define RICEMAX 32
+
+typedef struct encoder_control_t encoder_control_t;
+
+enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
+
+struct dep_quant_scan_info
+{
+  uint8_t sig_ctx_offset[2];
+  uint8_t gtx_ctx_offset[2];
+  uint16_t cg_pos;
+  uint16_t pos_y;
+  uint16_t pos_x;
+  uint8_t next_sbb_right;
+  uint8_t next_sbb_below;
+};
+
+typedef struct
+{
+  int m_QShift;
+  int64_t m_QAdd;
+  int64_t m_QScale;
+  int64_t m_maxQIdx;
+  int64_t m_thresLast;
+  int64_t m_thresSSbb;
+  // distortion normalization
+  int m_DistShift;
+  int64_t m_DistAdd;
+  int64_t m_DistStepAdd;
+  int64_t m_DistOrgFact;
+  bool needs_init;
+} quant_block;
+
+typedef struct
+{
+  int32_t m_lastBitsX[TR_MAX_WIDTH];
+  int32_t m_lastBitsY[TR_MAX_WIDTH];
+  uint32_t m_sigSbbFracBits[SM_MAX_NUM_SIG_SBB_CTX][2];
+  uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2];
+  int32_t m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6];
+  bool needs_init;
+} rate_estimator_t;
+
+
+typedef struct
+{
+  uint8_t num;
+  uint8_t inPos[5];
+} NbInfoSbb;
+
+typedef struct
+{
+  uint16_t maxDist;
+  uint16_t num;
+  uint16_t outPos[5];
+} NbInfoOut;
+
+typedef struct {
+  int32_t absLevel[4];
+  int64_t deltaDist[4];
+} PQData;
+
+typedef struct {
+  int64_t ALIGNED(32) rdCost[8];
+  int32_t ALIGNED(32) absLevel[8];
+  int32_t ALIGNED(32) prevId[8];
+} Decision;
+
+
+typedef struct {
+  uint8_t* sbbFlags;
+  uint8_t* levels;
+} SbbCtx;
+
+typedef struct {
+  const NbInfoOut* m_nbInfo;
+  uint32_t m_sbbFlagBits[2][2];
+  SbbCtx m_allSbbCtx[2];
+  int m_curr_sbb_ctx_offset;
+  int m_prev_sbb_ctx_offset;
+  uint8_t sbb_memory[8 * 1024];
+  uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH];
+  int
num_coeff; +} common_context; + + +typedef struct { + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + uint32_t* m_sigFracBitsArray[12]; + int32_t* m_gtxFracBitsArray[21]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} depquant_state; +typedef struct { + int64_t ALIGNED(32) m_rdCost[12]; + uint8_t ALIGNED(32) m_absLevels[3][16 * 4]; + uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; + int8_t ALIGNED(16) m_numSigSbb[12]; + int ALIGNED(32) m_remRegBins[12]; + int8_t ALIGNED(16) m_refSbbCtxId[12]; + uint32_t ALIGNED(32) m_sbbFracBits[12][2]; + uint32_t ALIGNED(32) m_sigFracBits[12][2]; + int32_t ALIGNED(32) m_coeffFracBits[12][6]; + int8_t ALIGNED(16) m_goRicePar[12]; + int8_t ALIGNED(16) m_goRiceZero[12]; + int8_t ALIGNED(16) m_stateId[12]; + uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; + int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; + + bool all_gte_four; + bool all_lt_four; +} all_depquant_states; + +typedef struct { + common_context m_common_context; + all_depquant_states m_allStates; + int m_curr_state_offset; + int m_prev_state_offset; + int m_skip_state_offset; + depquant_state m_startState; + quant_block* m_quant; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; +} context_store; + + +int uvg_init_nb_info(encoder_control_t* encoder); +void uvg_dealloc_nb_info(encoder_control_t* encoder); + + +void uvg_dep_quant_dequant( + const encoder_state_t* const state, + const int block_type, + const int width, + const int height, + const color_t compID, + coeff_t* quant_coeff, + coeff_t* coeff, + bool enableScalingLists); + +int uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + const coeff_t* srcCoeff, + coeff_t* coeff_out, + const color_t compID, + enum uvg_tree_type tree_type, + int* absSum, + const bool enableScalingLists); + + +void uvg_dep_quant_update_state( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id); + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision* decisions, + int decision_id); + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states* const state, + const enum ScanPosType spt, + const PQData* pqDataA, + Decision* decisions, + const int decisionA, + const int decisionB, + const int state_offset); +#endif diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 7a3f401c..858d89f4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -47,18 +47,19 @@ #include "tables.h" #include "videoframe.h" -bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu) +bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu, const cu_loc_t* + const 
cu_loc) { uint32_t ts_max_size = 1 << state->encoder_control->cfg.trskip_max_size; const uint32_t max_size = 32; // CU::isIntra(cu) ? MTS_INTRA_MAX_CU_SIZE : MTS_INTER_MAX_CU_SIZE; - const uint32_t cu_width = LCU_WIDTH >> pred_cu->depth; - const uint32_t cu_height = LCU_WIDTH >> pred_cu->depth; + const uint32_t cu_width = cu_loc->width; + const uint32_t cu_height = cu_loc->height; //bool mts_allowed = cu.chType == CHANNEL_TYPE_LUMA && compID == COMPONENT_Y; uint8_t mts_type = state->encoder_control->cfg.mts; bool mts_allowed = mts_type == UVG_MTS_BOTH || (pred_cu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : pred_cu->type == CU_INTER && mts_type == UVG_MTS_INTER); mts_allowed &= cu_width <= max_size && cu_height <= max_size; - //mts_allowed &= !cu.ispMode; // ISP_TODO: Uncomment this when ISP is implemented. + mts_allowed &= pred_cu->type == CU_INTRA ? !pred_cu->intra.isp_mode : true; //mts_allowed &= !cu.sbtInfo; mts_allowed &= !(pred_cu->bdpcmMode && cu_width <= ts_max_size && cu_height <= ts_max_size); mts_allowed &= pred_cu->tr_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ; @@ -66,14 +67,16 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr return mts_allowed; } -static void encode_mts_idx(encoder_state_t * const state, +static void encode_mts_idx( + encoder_state_t * const state, cabac_data_t * const cabac, - const cu_info_t *const pred_cu) + const cu_info_t *const pred_cu, + const cu_loc_t* const cu_loc) { //TransformUnit &tu = *cu.firstTU; int mts_idx = pred_cu->tr_idx; - if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu) && mts_idx != MTS_SKIP + if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu, cu_loc) && mts_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ) @@ -102,122 +105,67 @@ static void encode_mts_idx(encoder_state_t * const state, } } -// ISP_TODO: move these defines to a proper place when ISP is implemented -// As of now, these are only needed in lfnst checks -#define NOT_INTRA_SUBPARTITIONS 0 -#define HOR_INTRA_SUBPARTITIONS 1 -#define VER_INTRA_SUBPARTITIONS 2 -#define NUM_INTRA_SUBPARTITIONS_MODES 3 -#define INTRA_SUBPARTITIONS_RESERVED 4 -#define TU_1D_HOR_SPLIT 8 -#define TU_1D_VER_SPLIT 9 -#define MIN_TB_SIZE_X 4 -#define MIN_TB_SIZE_Y 4 - -static int get_isp_split_dim(const int width, const int height, const int isp_split_type) -{ - bool divide_tu_in_rows = isp_split_type == TU_1D_HOR_SPLIT; - uint32_t split_dim_size, non_split_dim_size, partition_size, div_shift = 2; - - if (divide_tu_in_rows) - { - split_dim_size = height; - non_split_dim_size = width; - } - else - { - split_dim_size = width; - non_split_dim_size = height; - } - - const unsigned min_num_samples_cu = 1 << ((uvg_math_floor_log2(MIN_TB_SIZE_Y) << 1)); - const unsigned factor_to_min_samples = non_split_dim_size < min_num_samples_cu ? min_num_samples_cu >> uvg_math_floor_log2(non_split_dim_size) : 1; - partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? 
factor_to_min_samples : (split_dim_size >> div_shift); - - assert(!(uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples_cu)) && "Partition has less than minimum amount of samples."); - return partition_size; -} - -static bool can_use_lfnst_with_isp(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) -{ - if (tree_type == UVG_CHROMA_T) { - return false; - } - if (isp_split_type == NOT_INTRA_SUBPARTITIONS) { - return false; - } - - const int tu_width = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? width : get_isp_split_dim(width, height, TU_1D_VER_SPLIT); - const int tu_height = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? get_isp_split_dim(width, height, TU_1D_HOR_SPLIT) : height; - - if (!(tu_width >= MIN_TB_SIZE_Y && tu_height >= MIN_TB_SIZE_Y)) - { - return false; - } - return true; -} - - bool uvg_is_lfnst_allowed( +bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu) + const cu_loc_t* const cu_loc, const lcu_t* const lcu) { - if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA) { - const int isp_mode = 0; // ISP_TODO: assign proper ISP mode when ISP is implemented - const int isp_split_type = 0; - const int depth = pred_cu->depth; - const int chroma_width = width >> 1; - const int chroma_height = height >> 1; - const int cu_width = tree_type != UVG_LUMA_T || depth == 4 ? width : chroma_width; - const int cu_height = tree_type != UVG_LUMA_T || depth == 4 ? height : chroma_height; - bool can_use_lfnst_with_mip = (width >= 16 && height >= 16); - bool is_sep_tree = depth == 4 || tree_type != UVG_BOTH_T; + if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && PU_IS_TU(pred_cu)) { + const int isp_mode = pred_cu->intra.isp_mode; + const int cu_width = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_width : 1 << pred_cu->log2_chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_height : 1 << pred_cu->log2_chroma_height; + bool can_use_lfnst_with_mip = (cu_width >= 16 && cu_height >= 16); + bool is_sep_tree = tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? 
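  // MIP is a luma-only tool, so the flag is only meaningful for COLOR_Y.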
pred_cu->intra.mip_flag : false; - if ((isp_mode && !can_use_lfnst_with_isp(width, height, isp_split_type, tree_type)) || - (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || + if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type) && color == COLOR_Y) || + (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip && color == COLOR_Y) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || - (cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH)) { + (cu_width > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) || cu_height > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)))) { return false; } - bool luma_flag = (depth == 4 && color == COLOR_Y) || (tree_type != UVG_CHROMA_T && depth != 4); - bool chroma_flag = (depth == 4 && color != COLOR_Y) || tree_type != UVG_LUMA_T; - bool non_zero_coeff_non_ts_corner_8x8 = (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + bool luma_flag = tree_type != UVG_CHROMA_T; + bool chroma_flag = tree_type != UVG_LUMA_T; + bool non_zero_coeff_non_ts_corner_8x8 = false; + bool last_scan_pos = false; bool is_tr_skip = false; - + + int split_num = color == COLOR_Y && isp_mode ? uvg_get_isp_split_num(cu_width, cu_height, isp_mode, false) : 0; const videoframe_t* const frame = state->tile->frame; - //const int num_pred_units = kvz_part_mode_num_parts[pred_cu->part_size]; - const int tr_depth = pred_cu->tr_depth; - assert(depth <= tr_depth && "Depth greater than transform depth. This should never trigger."); - const int num_transform_units = 1 << (2 * (tr_depth - depth)); - const int tu_row_length = 1 << (tr_depth - depth); - const int tu_width = cu_width >> (tr_depth - depth); - const int tu_height = tu_width; // TODO: height for non-square blocks - // TODO: chroma transform skip - if (color == COLOR_Y) { - for (int i = 0; i < num_transform_units; i++) { - // TODO: this works only for square blocks - const int tu_x = x + ((i % tu_row_length) * tu_width); - const int tu_y = y + ((i / tu_row_length) * tu_height); - const cu_info_t* cur_tu = lcu ? LCU_GET_CU_AT_PX(lcu, tu_x, tu_y) : uvg_cu_array_at_const(frame->cu_array, tu_x, tu_y); - assert(cur_tu != NULL && "NULL transform unit."); - bool cbf_set = cbf_is_set(cur_tu->cbf, tr_depth, COLOR_Y); + if (split_num) { + // Constraints for ISP split blocks + for (int i = 0; i < split_num; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_width, cu_height, i, isp_mode, false); + int local_split_x = lcu ? split_loc.local_x : split_loc.x; + int local_split_y = lcu ? split_loc.local_y : split_loc.y; + uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y, MAX(cu_width, cu_height)); + const cu_info_t* split_cu = lcu ? 
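+        // During the search the CUs live in the local LCU working copy; when writing the
+        // final bitstream (lcu == NULL) they are read from the frame-level CU array.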
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : + uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); - if (cur_tu != NULL && cbf_set && cur_tu->tr_idx == MTS_SKIP) { - is_tr_skip = true; + //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + // ISP_TODO: remove this if clause altogether if it seems it is not needed + if (true) { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); + //last_scan_pos |= split_cu->lfnst_last_scan_pos; + last_scan_pos |= true; } } } + else { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + last_scan_pos |= pred_cu->lfnst_last_scan_pos; + } - if ((!pred_cu->lfnst_last_scan_pos && !isp_mode) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { + if (color == COLOR_Y && pred_cu->tr_idx == MTS_SKIP) { + is_tr_skip = true; + } + + if ((!last_scan_pos) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { return false; } return true; @@ -231,19 +179,15 @@ static bool encode_lfnst_idx( const encoder_state_t* const state, cabac_data_t * const cabac, const cu_info_t * const pred_cu, - const int x, - const int y, - const int depth, - const int width, - const int height, enum uvg_tree_type tree_type, - const color_t color) + const color_t color, + const cu_loc_t* const cu_loc) { - if (uvg_is_lfnst_allowed(state, pred_cu, width, height, x, y, tree_type, color, NULL)) { + if (uvg_is_lfnst_allowed(state, pred_cu, tree_type, color, cu_loc, NULL)) { // Getting separate tree bool from block size is a temporary fix until a proper dual tree check is possible (there is no dual tree structure at time of writing this). // VTM seems to force explicit dual tree structure for small 4x4 blocks - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool is_separate_tree = tree_type != UVG_BOTH_T; const int lfnst_index = !is_separate_tree || color == COLOR_Y ? 
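+  // With a separate (dual) tree, chroma carries its own LFNST index.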
pred_cu->lfnst_idx : pred_cu->cr_lfnst_idx; assert((lfnst_index >= 0 && lfnst_index < 3) && "Invalid LFNST index."); @@ -261,6 +205,12 @@ static bool encode_lfnst_idx( return true; } else { + if(color == COLOR_Y) { + assert(pred_cu->lfnst_idx == 0); + } + if(tree_type == UVG_CHROMA_T && color != COLOR_Y) { + assert(pred_cu->cr_lfnst_idx == 0); + } return false; } } @@ -269,9 +219,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, - double* bits_out) { + double* bits_out) +{ //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; uint32_t i; @@ -282,10 +234,14 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // CONSTANTS - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + // TODO: log2_cg_size is wrong if width != height + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + double bits = 0; // Init base contexts according to block type @@ -293,23 +249,23 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac->cur_ctx = base_coeff_group_ctx; - int maxCtxBins = (width * width * 7) >> 2; + int maxCtxBins = (width * height * 7) >> 2; unsigned scan_cg_last = (unsigned )-1; //unsigned scan_pos_last = (unsigned )-1; - for (i = 0; i < width * width; i++) { + for (i = 0; i < width * height; i++) { if (coeff[scan[i]]) { - //scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } - scan_cg_last = (width * width - 1) >> log2_cg_size; + // TODO: this won't work with non-square blocks + scan_cg_last = (width * height - 1) >> log2_cg_size; const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); bool no_sig_group_before_last = true; for (i = 0; i <= scan_cg_last; i++) { - if (!(width == 4 || (i ==scan_cg_last && no_sig_group_before_last))) { + if (!((width == 4 && height == 4) || (i ==scan_cg_last && no_sig_group_before_last))) { uint32_t cg_blkpos = scan_cg[i]; uint32_t cg_pos_y = cg_blkpos / cg_width; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * cg_width); @@ -462,13 +418,13 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t type, uint8_t scan, double* bits_out) { const int index_x = uvg_math_floor_log2(width); - const int index_y = uvg_math_floor_log2(width); + const int index_y = uvg_math_floor_log2(height); const int prefix_ctx[8] = { 0, 0, 0, 3, 6, 10, 15, 21 }; //ToDo: own ctx_offset and shift for X and Y uint8_t ctx_offset_x = type ? 0 : prefix_ctx[index_x]; uint8_t ctx_offset_y = type ? 0 : prefix_ctx[index_y]; - uint8_t shift_x = type ? CLIP(0, 2, width>>3) : (index_x+1)>>2; - uint8_t shift_y = type ? 
CLIP(0, 2, width >> 3) : (index_y + 1) >> 2; + uint8_t shift_x = type ? CLIP(0, 2, width >> 3) : (index_x + 1) >> 2; + uint8_t shift_y = type ? CLIP(0, 2, height >> 3) : (index_y + 1) >> 2; double bits = 0; cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma); @@ -515,107 +471,130 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, - int x, - int y, - int depth, - const uint8_t width_c, + const cu_loc_t * const cu_loc, cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, - uint8_t joint_chroma, - enum - uvg_tree_type tree_type) + uint8_t joint_chroma) { - int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; - int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; + int width_c = cu_loc->chroma_width; + int height_c = cu_loc->chroma_height; + int x_local = (cu_loc->x >> 1) % LCU_WIDTH_C; + int y_local = (cu_loc->y >> 1) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; - *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); + *scan_idx = SCAN_DIAG; if(!joint_chroma){ - const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + coeff_t coeff_u[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t coeff_v[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_u, coeff->u, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + uvg_get_sub_coeff(coeff_v, coeff->v, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) { - if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ + if (cbf_is_set(cur_pu->cbf, COLOR_U)) { + if(state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; // HEVC only supports transform_skip for Luma // TODO: transform skip for chroma blocks CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, cu_loc, COLOR_U, *scan_idx, cur_pu, NULL); } - if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if (cbf_is_set(cur_pu->cbf, COLOR_V)) { + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } else { - const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << 
state->encoder_control->cfg.trskip_max_size)) { + coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } static void encode_transform_unit( encoder_state_t * const state, - int x, - int y, - int depth, - bool only_chroma, + const cu_loc_t *cu_loc, + const cu_info_t* cur_pu, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + bool only_chroma, + enum uvg_tree_type tree_type, + bool last_split, + const cu_loc_t *original_loc, + const cu_loc_t* const chroma_loc) // Original cu dimensions, before CU split { - assert(depth >= 1 && depth <= MAX_PU_DEPTH); - const videoframe_t * const frame = state->tile->frame; cabac_data_t* const cabac = &state->cabac; - const uint8_t width = LCU_WIDTH >> depth; - const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); + const int x = cu_loc->x; + const int y = cu_loc->y; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y); + int isp_x = x; + int isp_y = y; + uvg_get_isp_cu_arr_coords(&isp_x, &isp_y, MAX(width, height)); + if(cur_pu == NULL) { + cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); + } - int8_t scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; - int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); + int cbf_y = cbf_is_set(cur_pu->cbf, COLOR_Y); if (cbf_y && !only_chroma) { int x_local = x % LCU_WIDTH; int y_local = y % LCU_WIDTH; - const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + // const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, coeff->y, x_local, y_local, width, height, LCU_WIDTH); // CoeffNxN // Residual Coding - if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if(state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0); + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, height, (cur_pu->tr_idx == MTS_SKIP) ? 
1 : 0); } if(cur_pu->tr_idx == MTS_SKIP) { - uvg_encode_ts_residual(state, cabac, coeff_y, width, 0, scan_idx, NULL); + uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL); } else { uvg_encode_coeff_nxn(state, cabac, coeff_y, - width, + cu_loc, 0, scan_idx, (cu_info_t * )cur_pu, NULL); } + if (tree_type == UVG_LUMA_T) return; } bool joint_chroma = cur_pu->joint_cb_cr != 0; - if (depth == MAX_DEPTH) { + if (cur_pu->log2_height + cur_pu->log2_width < 6 && tree_type != UVG_CHROMA_T && !only_chroma) { // For size 4x4 luma transform the corresponding chroma transforms are // also of size 4x4 covering 8x8 luma pixels. The residual is coded in // the last transform unit. @@ -629,11 +608,12 @@ static void encode_transform_unit( } } - bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) || - cbf_is_set(cur_pu->cbf, depth, COLOR_V); - if (chroma_cbf_set || joint_chroma) { + bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, COLOR_U) || + cbf_is_set(cur_pu->cbf, COLOR_V); + if ((chroma_cbf_set || joint_chroma) && last_split && chroma_loc) { //Need to drop const to get lfnst constraints - encode_chroma_tu(state, x, y, depth, width_c, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + // Use original dimensions instead of ISP split dimensions + encode_chroma_tu(state, chroma_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma); } } @@ -642,120 +622,104 @@ static void encode_transform_unit( * \param x_pu Prediction units' x coordinate. * \param y_pu Prediction units' y coordinate. * \param depth Depth from LCU. - * \param tr_depth Depth from last CU. * \param parent_coeff_u What was signaled at previous level for cbf_cb. * \param parent_coeff_v What was signlaed at previous level for cbf_cr. */ static void encode_transform_coeff( encoder_state_t * const state, - int32_t x, - int32_t y, - int8_t depth, - int8_t tr_depth, - uint8_t parent_coeff_u, - uint8_t parent_coeff_v, + const cu_loc_t * cu_loc, bool only_chroma, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + const cu_info_t* cur_tu, + enum uvg_tree_type tree_type, + bool last_split, + bool can_skip_last_cbf, + int *luma_cbf_ctx, + // Always true except when writing sub partition coeffs (ISP) + const cu_loc_t * const original_loc, + const cu_loc_t* const chroma_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; + + bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; + int x = cu_loc->x; + int y = cu_loc->y; + if (isp_split) { + uvg_get_isp_cu_arr_coords(&x, &y, MAX(cu_loc->width, cu_loc->height)); + } + //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, x, y); - // Round coordinates down to a multiple of 8 to get the location of the - // containing CU. - const int x_cu = 8 * (x / 8); - const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); - - // NxN signifies implicit transform split at the first transform level. - // There is a similar implicit split for inter, but it is only used when - // transform hierarchy is not in use. - //int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN); - - // The implicit split by intra NxN is not counted towards max_tr_depth. 
- /* - int max_tr_depth; - if (cur_cu->type == CU_INTRA) { - max_tr_depth = ctrl->cfg.tr_depth_intra + intra_split_flag; - } else { - max_tr_depth = ctrl->tr_depth_inter; + if(cur_tu == NULL) { + cur_tu = uvg_cu_array_at_const(used_array, x, y); } - */ - int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH); + const int tr_limit = TR_MAX_WIDTH; + const bool ver_split = cu_loc->height > tr_limit; + const bool hor_split = cu_loc->width > tr_limit; - + const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_tu->cbf, COLOR_Y) : 0; + const int cb_flag_u = tree_type != UVG_LUMA_T ?(cur_tu->joint_cb_cr ? (cur_tu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_tu->cbf, COLOR_U)) : 0; + const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_tu->joint_cb_cr ? cur_tu->joint_cb_cr & 1 : cbf_is_set(cur_tu->cbf, COLOR_V)) : 0; - const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0; - const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0; - const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V)) : 0; - // The split_transform_flag is not signaled when: - // - transform size is greater than 32 (depth == 0) - // - transform size is 4 (depth == MAX_PU_DEPTH) - // - transform depth is max - // - cu is intra NxN and it's the first split - - //ToDo: check BMS transform split in QTBT - /* - if (depth > 0 && - depth < MAX_PU_DEPTH && - tr_depth < max_tr_depth && - !(intra_split_flag && tr_depth == 0)) - { - cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - CABAC_BIN(cabac, split, "split_transform_flag"); - } - */ - - // Chroma cb flags are not signaled when one of the following: - // - transform size is 4 (2x2 chroma transform doesn't exist) - // - they have already been signaled to 0 previously - // When they are not present they are inferred to be 0, except for size 4 - // when the flags from previous level are used. - if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T) { - - if (!split) { - if (true) { - assert(tr_depth < 5); - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); - } - if (true) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 
1 : 0]); - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); - } + if (hor_split || ver_split) { + enum split_type split; + if (cu_loc->width > tr_limit && cu_loc->height > tr_limit) { + split = QT_SPLIT; + } + else if (cu_loc->width > tr_limit) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; } - } - if (split) { - uint8_t offset = LCU_WIDTH >> (depth + 1); - int x2 = x + offset; - int y2 = y + offset; - encode_transform_coeff(state, x, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc, NULL); + for (int i = 0; i < split_count; ++i) { + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, + coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc ? &split_cu_loc[i] : NULL); + } return; } + + // Chroma cb flags are not signaled when one of the following holds: + // There is no chroma. + // This is not the last CU in an area of 64 pixels covered by more than one luma CU. + // This is not the last ISP split. + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (chroma_loc || only_chroma) + && tree_type != UVG_LUMA_T + && last_split) { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); + } // Luma coded block flag is signaled when one of the following: // - prediction mode is intra // - transform depth > 0 // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. - if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + if ((cur_tu->type == CU_INTRA || !PU_IS_TU(cur_tu) || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { + if (can_skip_last_cbf && isp_split && last_split) { + // Do not write luma cbf if the first three ISP splits have luma cbf 0 + } else { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); + if (PU_IS_TU(cur_tu)) { + *luma_cbf_ctx = 2 + cb_flag_y; + } + } } if (cb_flag_y | cb_flag_u | cb_flag_v) { - if (state->must_code_qp_delta && (only_chroma || cb_flag_y || depth != 4) ) { - const int qp_pred = uvg_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); - const int qp_delta = cur_cu->qp - qp_pred; + if (state->must_code_qp_delta && (only_chroma || cb_flag_y || chroma_loc) ) { + const int qp_pred = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, state->last_qp); + const int qp_delta = cur_tu->qp - qp_pred; // Possible deltaQP range depends on bit depth as stated in HEVC specification.
assert(qp_delta >= UVG_QP_DELTA_MIN && qp_delta <= UVG_QP_DELTA_MAX && "QP delta not in valid range."); @@ -778,16 +742,18 @@ static void encode_transform_coeff( } if(( ((cb_flag_u || cb_flag_v ) - && cur_cu->type == CU_INTRA) + && cur_tu->type == CU_INTRA) || (cb_flag_u && cb_flag_v)) - && (depth != 4 || only_chroma || tree_type == UVG_CHROMA_T) + && (chroma_loc || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr + && last_split ) { - assert(cur_pu->joint_cb_cr < 4 && "JointCbCr is in search state."); + assert(cur_tu->joint_cb_cr < 4 && "JointCbCr is in search state."); cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; - CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); + CABAC_BIN(cabac, cur_tu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, x, y, depth, only_chroma, coeff, tree_type); + + encode_transform_unit(state, cu_loc, only_chroma ? cur_tu : NULL, coeff, only_chroma, tree_type, last_split, original_loc, chroma_loc); } } @@ -799,11 +765,13 @@ static void encode_transform_coeff( * \param depth Depth from LCU. * \return if non-zero mvd is coded */ -int uvg_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu, double* bits_out) +int uvg_encode_inter_prediction_unit( + encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc) { // Mergeflag int16_t num_cand = 0; @@ -838,8 +806,8 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 - uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1)); + if (cu_loc->width + cu_loc->height > 12) { // ToDo: limit on 4x8/8x4 + uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(cu_loc->width) + uvg_math_floor_log2(cu_loc->height) + 1) >> 1)); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } @@ -890,16 +858,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, if (lcu) { uvg_inter_get_mv_cand( state, - x, y, width, height, - mv_cand, cur_cu, - lcu, ref_list_idx); + mv_cand, cur_cu, lcu, ref_list_idx, + cu_loc); } else { uvg_inter_get_mv_cand_cua( state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx - ); + mv_cand, cur_cu, ref_list_idx, cu_loc + ); } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); @@ -922,14 +888,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, } static void encode_chroma_intra_cu( - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - const int cclm_enabled, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + const int cclm_enabled, + int8_t luma_intra_dir, double* bits_out) { unsigned pred_mode = 0; unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83}; int8_t chroma_intra_dir = cur_cu->intra.mode_chroma; - int8_t luma_intra_dir = !cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0; for(int i = 0; i < 4; i++) { if(chroma_pred_modes[i] == luma_intra_dir) { chroma_pred_modes[i] = 66; @@ -1011,10 +977,13 @@ static void encode_chroma_intra_cu( else if (cabac->only_count && bits_out)*bits_out += bits; } -void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out) +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual; @@ -1026,6 +995,9 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uint32_t flag; double bits = 0; + const int x = cu_loc->x; + const int y = cu_loc->y; + /* if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) { cabac->cur_ctx = &(cabac->ctx.bdpcm_mode[0]); @@ -1049,16 +1021,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } */ - // Intra Subpartition mode - uint32_t width = (LCU_WIDTH >> depth); - uint32_t height = (LCU_WIDTH >> depth); - - bool enough_samples = uvg_g_convert_to_bit[width] + uvg_g_convert_to_bit[height] > (uvg_g_convert_to_bit[4 /* MIN_TB_SIZEY*/] << 1); - uint8_t isp_mode = 0; - // ToDo: add height comparison - //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 1 : 0; - //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0; - bool allow_isp = enough_samples; + uint32_t width = cu_loc->width; + uint32_t height = cu_loc->height; // TODO: height for non-square blocks // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; @@ -1083,9 +1047,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) { - const int cu_width = LCU_WIDTH >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks - uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? NULL : frame->cu_array); + uint8_t ctx_id = uvg_get_mip_flag_context(cu_loc, lcu, lcu ? NULL : frame->cu_array); // Write MIP flag CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag"); @@ -1104,7 +1066,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, int multi_ref_idx = enable_mrl ? cur_cu->intra.multi_ref_idx : 0; #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, width, multi_ref_idx); + if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, height, multi_ref_idx); #endif if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) { @@ -1116,21 +1078,21 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } } + bool enable_isp = state->encoder_control->cfg.isp; + // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Blocks of size 2 do not exist yet (not for luma at least) + bool allow_isp = enable_isp ? uvg_can_use_isp(width, height) : false; + uint8_t isp_mode = allow_isp ? 
cur_cu->intra.isp_mode : 0; - // ToDo: update real usage, these if clauses as such don't make any sense - if (isp_mode != 0 && multi_ref_idx == 0) { - if (isp_mode) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); - } else { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); - // ToDo: complete this if-clause - if (isp_mode == 3) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); - } + if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { + if (isp_mode == ISP_MODE_NO_ISP) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subpartitions_mode"); + } + else { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subpartitions_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[1]), isp_mode - 1, bits, "intra_subpartitions_split_type"); // Vertical or horizontal split } } - - const int cu_width = LCU_WIDTH >> depth; + // PREDINFO CODING // If intra prediction mode is found from the predictors, // it can be signaled with two EP's. Otherwise we can send @@ -1145,7 +1107,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, if (x > 0) { assert(x >> 2 > 0); const int x_scu = SUB_SCU(x) - 1; - const int y_scu = SUB_SCU(y + cu_width - 1); + const int y_scu = SUB_SCU(y + height - 1); left_pu = lcu ? LCU_GET_CU_AT_PX( lcu, @@ -1154,7 +1116,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uvg_cu_array_at_const( frame->cu_array, x - 1, - y + cu_width - 1); + y + height - 1); } // Don't take the above PU across the LCU boundary. if (y % LCU_WIDTH > 0 && y > 0) { @@ -1162,11 +1124,11 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, above_pu = lcu ? LCU_GET_CU_AT_PX( lcu, - SUB_SCU(x + cu_width - 1), + SUB_SCU(x + width - 1), SUB_SCU(y) - 1) : uvg_cu_array_at_const( frame->cu_array, - x + cu_width - 1, + x + width - 1, y - 1); } @@ -1185,8 +1147,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } // Is the mode in the MPM array or not flag = (mpm_preds == -1) ? 
0 : 1; - if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag"); + if (cur_pu->intra.multi_ref_idx == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "intra_luma_mpm_flag"); } // Signal index of the prediction mode in the prediction list, if it is there @@ -1262,144 +1224,139 @@ if (cabac->only_count && bits_out) *bits_out += bits; } -bool uvg_write_split_flag( - const encoder_state_t * const state, + +uint8_t uvg_write_split_flag( + const encoder_state_t* const state, cabac_data_t* cabac, - const cu_info_t * left_cu, - const cu_info_t * above_cu, - uint8_t split_flag, - int depth, - int cu_width, - int x, - int y, + const cu_info_t* left_cu, + const cu_info_t* above_cu, + const cu_loc_t* const cu_loc, + split_tree_t split_tree, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out) { - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); double bits = 0; - const encoder_control_t* const ctrl = state->encoder_control; // Implicit split flag when on border. // An exception is made in VVC: the flag is not implicit if a BT split can be used // horizontally or vertically; in that case this flag tells whether QT or BT is used. + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; - bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; - no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - if (depth > MAX_DEPTH) allow_qt = false; - // ToDo: update this when btt is actually used - bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH - - uint8_t implicit_split_mode = UVG_NO_SPLIT; - //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_width - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T))); - bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T))); + bool can_split[6]; + const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); - if (!bottom_left_available && !top_right_available && allow_qt) { - implicit_split_mode = UVG_QUAD_SPLIT; - } - else if (!bottom_left_available && allow_btt) { - implicit_split_mode = UVG_HORZ_SPLIT; - } - else if (!top_right_available && allow_btt) { - implicit_split_mode = UVG_VERT_SPLIT; - } - else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = UVG_QUAD_SPLIT; - } - - // Check split conditions - if (implicit_split_mode != UVG_NO_SPLIT) { - no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == UVG_HORZ_SPLIT); - bv_split = (implicit_split_mode == UVG_VERT_SPLIT); - } - if (!allow_btt) { - bh_split = bv_split = th_split = tv_split = false; - } + bool allow_split = can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; - bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; + enum split_type split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; - split_flag |= implicit_split_mode != UVG_NO_SPLIT; + assert(can_split[split_flag] && "Trying to write an illegal split"); + + // split_flag = is_implicit ? (can_split[QT_SPLIT] ? 
BT_HOR_SPLIT : BT_VER_SPLIT)) : split_flag; + *is_implicit_out = is_implicit; int split_model = 0; - if (no_split && allow_split) { + if (can_split[NO_SPLIT] && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number - // ToDo: should use height and width to increase model, PU_GET_W() ? - if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + if (left_cu && (1 << left_cu->log2_height) < cu_height) { split_model++; } - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + if (above_cu && (1 << above_cu->log2_width) < cu_width) { split_model++; } uint32_t split_num = 0; - if (allow_qt) split_num += 2; - if (bh_split) split_num++; - if (bv_split) split_num++; - if (th_split) split_num++; - if (tv_split) split_num++; + if (can_split[QT_SPLIT]) split_num += 2; + if (can_split[BT_HOR_SPLIT]) split_num++; + if (can_split[BT_VER_SPLIT]) split_num++; + if (can_split[TT_HOR_SPLIT]) split_num++; + if (can_split[TT_VER_SPLIT]) split_num++; if (split_num > 0) split_num--; split_model += 3 * (split_num >> 1); cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - if(cabac->only_count && !split_flag) { - //printf("%hu %hu %d %d %d\n", state->search_cabac.ctx.split_flag_model[split_model].state[0], state->search_cabac.ctx.split_flag_model[split_model].state[1], - // split_model, x, y); - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != NO_SPLIT, bits, "split_cu_flag"); } - bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT; - if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { - split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag"); + if ((!is_implicit || (can_split[QT_SPLIT] && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT]))) + && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) + && split_flag != NO_SPLIT) { + bool qt_split = split_flag == QT_SPLIT; + if((can_split[BT_VER_SPLIT] || can_split[BT_HOR_SPLIT] || can_split[TT_VER_SPLIT] || can_split[TT_HOR_SPLIT]) && can_split[QT_SPLIT]) { + unsigned left_qt_depth = 0; + unsigned top_qt_depth = 0; + if(left_cu) { + while (((left_cu->split_tree >> (left_qt_depth * 3)) & 7u) == QT_SPLIT) { + left_qt_depth++; + } + } + if(above_cu) { + while (((above_cu->split_tree >> (top_qt_depth * 3)) & 7u) == QT_SPLIT) { + top_qt_depth++; + } + } + split_model = (left_cu && (left_qt_depth > split_tree.current_depth)) + (above_cu && (top_qt_depth > split_tree.current_depth)) + (split_tree.current_depth < 2 ? 
0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag"); + } + if (!qt_split) { + const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT; + if((can_split[BT_HOR_SPLIT] || can_split[TT_HOR_SPLIT]) && (can_split[BT_VER_SPLIT] || can_split[TT_VER_SPLIT])) { + split_model = 0; + if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] > can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { + split_model = 4; + } else if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] < can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { + split_model = 3; + } else { + const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1); + const int d_l = cu_height / (left_cu ? (1 << left_cu->log2_height) : 1); + if(d_a != d_l && above_cu && left_cu) { + split_model = d_a < d_l ? 1 : 2; + } + } + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag"); + } + if ((can_split[BT_VER_SPLIT] && can_split[TT_VER_SPLIT] && is_vertical) || (can_split[BT_HOR_SPLIT] && can_split[TT_HOR_SPLIT] && !is_vertical)) { + split_model = (2 * is_vertical) + (split_tree.mtt_depth <= 1); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), + split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag"); + } + } } - // Only signal split when it is not implicit, currently only Qt split supported - if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { - - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { - split_model++; - } - - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { - split_model++; - } - - split_model += (depth > 2 ? 0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode"); - } if (bits_out) *bits_out += bits; return split_flag; } void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x, - uint16_t y, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma) { cabac_data_t * const cabac = &state->cabac; const encoder_control_t * const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); + + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + + const int x = cu_loc->x; + const int y = cu_loc->y; - const int cu_width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks - const int half_cu = cu_width >> 1; + const cu_info_t* cur_cu = uvg_cu_array_at_const(used_array, x, y); + + const int depth = split_tree.current_depth; const cu_info_t *left_cu = NULL; if (x > 0) { @@ -1412,53 +1369,60 @@ void uvg_encode_coding_tree( // Absolute coordinates - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y ; - int32_t frame_width = tree_type != UVG_CHROMA_T ? ctrl->in.width : ctrl->in.width / 2; - int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2; - // Check for slice border - bool border_x = frame_width < abs_x + cu_width; - bool border_y = frame_height < abs_y + cu_width; - bool border_split_x = frame_width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu; - bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; - bool border = border_x || border_y; /*!< are we in any border CU */ + int32_t frame_width = ctrl->in.width; + int32_t frame_height = ctrl->in.height; + + // Stop if we are outside of the frame + if (abs_x >= frame_width || abs_y >= frame_height) return; if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) { - - const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, tree_type,NULL); + if (cu_width + cu_height > 8) { + split_tree.split_tree = cur_cu->split_tree; + bool is_implicit; + const int split_flag = uvg_write_split_flag( + state, + cabac, + left_cu, + above_cu, + tree_type != UVG_CHROMA_T ? 
cu_loc : chroma_loc, + split_tree, + tree_type, + &is_implicit, + NULL + ); - if (split_flag || border) { - // Split blocks and remember to change x and y block positions - uvg_encode_coding_tree(state, x, y, depth + 1, coeff, tree_type); + if (split_flag != NO_SPLIT) { + split_tree_t new_split_tree = { cur_cu->split_tree, + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_flag != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_flag != QT_SPLIT && is_implicit), + 0}; - if (!border_x || border_split_x) { - uvg_encode_coding_tree(state, x + half_cu, y, depth + 1, coeff, tree_type); - } - if (!border_y || border_split_y) { - uvg_encode_coding_tree(state, x, y + half_cu, depth + 1, coeff, tree_type); - } - if (!border || (border_split_x && border_split_y)) { - uvg_encode_coding_tree(state, x + half_cu, y + half_cu, depth + 1, coeff, tree_type); + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; + for (int split = 0; split < splits; ++split) { + uvg_encode_coding_tree(state, coeff, tree_type, + &new_cu_loc[split], + separate_chroma ? chroma_loc : &new_cu_loc[split], + new_split_tree, !separate_chroma || (split == splits - 1 && has_chroma)); + } + return; } } + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - //ToDo: check if we can actually split //ToDo: Implement MT split - if (depth < MAX_PU_DEPTH) - { - // cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - // CABAC_BIN(cabac, 0, "split_transform_flag"); - } - - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1); + // fprintf(stderr, "%4d %4d %2d %2d %d %d %d\n", x, y, cu_width, cu_height, has_chroma, tree_type, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1492,8 +1456,8 @@ void uvg_encode_coding_tree( cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); - uvg_hmvp_add_mv(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); + DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_height, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); int16_t num_cand = state->encoder_control->cfg.max_merge; if (num_cand > 1) { for (int ui = 0; ui < num_cand - 1; ui++) { @@ -1510,8 +1474,8 @@ } } #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]); - if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); + if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]); + if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); #endif goto end; @@ -1528,7 +1492,7 @@ CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4 && cu_height != 4) { int8_t ctx_predmode = 0; @@ -1548,11 +1512,7 @@ CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } } - - // part_mode - //encode_part_mode(state, cabac, cur_cu, 
depth); - - + #if ENABLE_PCM // Code IPCM block @@ -1571,8 +1531,8 @@ void uvg_encode_coding_tree( uvg_pixel *rec_base_v = &frame->rec->v[x / 2 + y / 2 * ctrl->in.width / 2]; // Luma - for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) { + for (unsigned y_px = 0; y_px < cu_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_width; x_px++) { uvg_bitstream_put(cabac->stream, base_y[x_px + y_px * ctrl->in.width], 8); rec_base_y[x_px + y_px * ctrl->in.width] = base_y[x_px + y_px * ctrl->in.width]; } @@ -1580,14 +1540,14 @@ void uvg_encode_coding_tree( // Chroma if (ctrl->chroma_format != UVG_CSP_400) { - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_u[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_u[x_px + y_px * (ctrl->in.width >> 1)] = base_u[x_px + y_px * (ctrl->in.width >> 1)]; } } - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_v[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_v[x_px + y_px * (ctrl->in.width >> 1)] = base_v[x_px + y_px * (ctrl->in.width >> 1)]; } @@ -1599,21 +1559,15 @@ void uvg_encode_coding_tree( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { uint8_t imv_mode = UVG_IMV_OFF; - - const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size]; bool non_zero_mvd = false; + + // TODO: height for non-square blocks + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, pu_x, pu_y); - - non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); - DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); - uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); - } + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, NULL, NULL, cu_loc); + DBG_PRINT_MV(state, x, y, cu_width, cu_height, cur_pu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_pu); + // imv mode, select between fullpel, half-pel and quarter-pel resolutions // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel @@ -1631,54 +1585,80 @@ void uvg_encode_coding_tree( } { - int cbf = cbf_is_set_any(cur_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + const bool has_coeffs = cur_pu->root_cbf || cur_pu->cbf; + if (!cur_cu->merged) { cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); - CABAC_BIN(cabac, cbf, "rqt_root_cbf"); + CABAC_BIN(cabac, has_coeffs, "rqt_root_cbf"); } // Code (possible) coeffs to bitstream - - if (cbf) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); + if (has_coeffs) { + int luma_cbf_ctx = 0; 
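As a reading aid for the cbf_luma signalling in encode_transform_coeff below: `luma_cbf_ctx` selects the qt_cbf_model_luma context. It is 0 for an ordinary TU and, when a CU is coded with ISP, starts at 2 and is thereafter derived from the previous sub-partition's flag (see `*luma_cbf_ctx = 2 + cb_flag_y` earlier in this patch). A standalone sketch of that schedule, with the helper name being illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Context index for cbf_luma: 0 for a normal TU; for ISP sub-partitions the
 * first one uses 2 and each later one uses 2 + (previous partition's cbf). */
static int luma_cbf_context(bool is_isp, bool first_part, bool prev_cbf)
{
  if (!is_isp) return 0;
  return first_part ? 2 : 2 + (prev_cbf ? 1 : 0);
}

int main(void)
{
  printf("normal TU: %d\n", luma_cbf_context(false, true, false));   /* 0 */
  printf("ISP part 0: %d\n", luma_cbf_context(true, true, false));   /* 2 */
  printf("ISP after cbf=1: %d\n", luma_cbf_context(true, false, true)); /* 3 */
  return 0;
}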
+ encode_transform_coeff(state, cu_loc, 0, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, cu_loc, cu_loc); } - encode_mts_idx(state, cabac, cur_cu); + encode_mts_idx(state, cabac, cur_cu, cu_loc); } } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, NULL, NULL); } + + const bool is_local_dual_tree = (chroma_loc->width != cu_loc->width || chroma_loc->height != cu_loc->height); // Code chroma prediction mode. - if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4 && tree_type == UVG_BOTH_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (chroma_loc->width == cu_loc->width && chroma_loc->height == cu_loc->height) + && tree_type == UVG_BOTH_T) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, !cur_cu->intra.mip_flag ? cur_cu->intra.mode : 0, NULL); + } + int luma_cbf_ctx = 0; + + if (tree_type != UVG_CHROMA_T) { + // Cycle through sub partitions if ISP is enabled. + // The ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = cur_cu->intra.isp_mode; + int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); + luma_cbf_ctx = split_limit != 1 ? 2 : 0; + // If the first three ISP splits all have luma cbf 0, the last one must be 1. Since the value can be derived, there is no need to write it. + bool can_skip_last_cbf = true; + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type, true); + + // Check if this is the last split, to write chroma + bool last_split = (i + 1) == split_limit; + encode_transform_coeff(state, &split_loc, + 0, coeff, NULL, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, + cu_loc, is_local_dual_tree ? NULL : chroma_loc); + can_skip_last_cbf &= luma_cbf_ctx == 2; + } } if (tree_type != UVG_CHROMA_T) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); - } + encode_lfnst_idx(state, cabac, cur_cu, is_local_dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc); - if (tree_type != UVG_CHROMA_T) { - bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y); + encode_mts_idx(state, cabac, cur_cu, cu_loc); } - encode_mts_idx(state, cabac, cur_cu); // For 4x4 the chroma PU/TU is coded after the last - if (state->encoder_control->chroma_format != UVG_CSP_400 && - ((depth == 4 && x % 8 && y % 8) || tree_type == UVG_CHROMA_T) && + if (state->encoder_control->chroma_format != UVG_CSP_400 && + ((is_local_dual_tree && + has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + int8_t luma_dir = uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir, NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints - cu_info_t* tmp = uvg_cu_array_at((cu_array_t*)used_array, x, y); + cu_info_t* tmp = uvg_cu_array_at((cu_array_t *)used_array, chroma_loc->x, chroma_loc->y); tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff, tree_type); + encode_transform_coeff(state, chroma_loc, 1, coeff, NULL, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); // Write LFNST only once for single tree structure - encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); + encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } } @@ -1688,13 +1668,13 @@ void uvg_encode_coding_tree( exit(1); } if (state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %d %d", x << (tree_type == UVG_CHROMA_T), y << (tree_type == UVG_CHROMA_T), depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&cabac->ctx, 1, sizeof(cabac->ctx), state->encoder_control->cabac_debug_file); } end: - if (is_last_cu_in_qg(state, x, y, depth)) { + if (is_last_cu_in_qg(state, cu_loc)) { state->last_qp = cur_cu->qp; } @@ -1703,27 +1683,31 @@ end: double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type) { + enum uvg_tree_type tree_type, + const split_tree_t split_tree) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); + const int x = cu_loc->x; + const int y = cu_loc->y; - const int cu_width = LCU_WIDTH >> depth; - + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + + int x_local = cu_loc->local_x; + int y_local = cu_loc->local_y; + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; + const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { if(x_local || tree_type != UVG_CHROMA_T) { left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); } else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); } } if (y) { @@ -1731,7 +1715,7 @@ double uvg_mock_encode_coding_unit( above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); } else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); } } @@ -1740,23 +1724,23 @@ double uvg_mock_encode_coding_unit( } // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (tree_type != UVG_CHROMA_T ? 
depth != MAX_DEPTH : depth != MAX_DEPTH - 1) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { + // We do not care whether the split is implicit or not, since there is never a split here + bool is_implicit; uvg_write_split_flag( state, cabac, left_cu, above_cu, - 0, - depth, - cu_width >> (tree_type == UVG_CHROMA_T), - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), - tree_type, - &bits); + cu_loc, + split_tree, + tree_type, &is_implicit, + &bits + ); } // Encode skip flag - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_skip = 0; if (left_cu && left_cu->skipped) { @@ -1789,7 +1773,7 @@ } } // Prediction mode - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_predmode = 0; @@ -1802,7 +1786,7 @@ if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; - const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, lcu, &bits, cu_loc); if (ctrl->cfg.amvr && non_zero_mvd) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); if (imv_mode > UVG_IMV_OFF) { @@ -1815,10 +1799,13 @@ } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } - if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); + if((chroma_loc || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc, cu_loc, cur_cu, tree_type != UVG_CHROMA_T ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, + is_separate_tree ? UVG_CHROMA_T : tree_type); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, chroma_loc, cur_cu, tree_type), luma_dir, &bits); } } else { @@ -1872,3 +1859,27 @@ if(bits_out) *bits_out = temp_bits_out; } + + +/** + * \brief Get a subset of LCU coeff array. + * + * \param dst Destination array. Should be coeff_t [32*32]. + * \param src Coeff LCU array. + * \param lcu_x Local LCU x coordinate. + * \param lcu_y Local LCU y coordinate. + * \param block_w Block width. + * \param block_h Block height. + * \param lcu_width LCU_WIDTH for luma, LCU_WIDTH_C for chroma. 
+ * + */ +void uvg_get_sub_coeff(const coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) +{ + // Take subset of coeff array + coeff_t* dst_ptr = (coeff_t*)dst; + const coeff_t* coeff_ptr = &src[lcu_x + lcu_y * lcu_width]; + for (int j = 0; j < block_h; ++j) { + //memcpy(dst_coeff + (j * lcu_width), &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + memcpy(&dst_ptr[j * block_w], &coeff_ptr[j * lcu_width], block_w * sizeof(coeff_t)); + } +} diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index c2cd39da..3df702ef 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -40,30 +40,29 @@ #include "encoderstate.h" #include "global.h" -bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu); +bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t* + const cu_loc); bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu); + const cu_loc_t* const cu_loc, const lcu_t* const lcu); void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x_ctb, - uint16_t y_ctb, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma); void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, double* bits); @@ -77,41 +76,47 @@ void uvg_encode_mvd(encoder_state_t * const state, double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const split_tree_t split_tree); -int uvg_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, - lcu_t* lcu, - double* bits_out); - -void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, +int uvg_encode_inter_prediction_unit( + encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out); + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc); + +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out); -bool uvg_write_split_flag( +uint8_t uvg_write_split_flag( const encoder_state_t* const state, cabac_data_t* cabac, const cu_info_t* left_cu, const cu_info_t* above_cu, - uint8_t split_flag, - int depth, - int cu_width, - int x, - int y, + const cu_loc_t* const cu_loc, + split_tree_t, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out); void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, uint8_t type, uint8_t scan, double* bits_out); + +void uvg_get_sub_coeff(const coeff_t* dst, const coeff_t* const src, + const int lcu_x, const 
int lcu_y, + const int block_w, const int block_h, + const int lcu_width); diff --git a/src/encoder.c b/src/encoder.c index d0121037..56d03305 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -320,6 +320,13 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) encoder->scaling_list.use_default_list = 1; } + if(cfg->dep_quant) { + if(!uvg_init_nb_info(encoder)) { + fprintf(stderr, "Could not initialize nb info.\n"); + goto init_failed; + } + } + // ROI / delta QP if (cfg->roi.file_path) { const char *mode[2] = { "r", "rb" }; @@ -378,11 +385,7 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) { goto init_failed; } - - // NOTE: When tr_depth_inter is equal to 0, the transform is still split - // for SMP and AMP partition units. - encoder->tr_depth_inter = 0; - + //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; diff --git a/src/encoder.h b/src/encoder.h index 0fb46e1b..05750292 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -38,6 +38,7 @@ * Initialization of encoder_control_t. */ +#include "dep_quant.h" #include "global.h" // IWYU pragma: keep #include "uvg266.h" #include "scalinglist.h" @@ -98,6 +99,10 @@ typedef struct encoder_control_t //scaling list scaling_list_t scaling_list; + NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1]; + NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1]; + struct dep_quant_scan_info* scan_info[7 + 1][7 + 1]; + //spec: references to variables defined in Rec. ITU-T H.265 (04/2013) int8_t tiles_enable; /*!<spec: tiles_enabled_flag */ diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma"); + if (encoder->cfg.max_btt_depth[0]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma"); + } + if (encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, encoder->cfg.dual_tree, 1, "qtbtt_dual_tree_intra_flag"); } if (encoder->cfg.dual_tree) { - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); - if (0 /*sps_max_mtt_hierarchy_depth_intra_slice_chroma != 0*/) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); + if (encoder->cfg.max_btt_depth[2]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); } } - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_inter_slice"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_inter_slice"); - - -#if 0 // mtt depth intra - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_luma"); - 
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice"); + if (encoder->cfg.max_btt_depth[1] != 0) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group"); } -#endif -#if 0 // mtt depth inter - if (max_mtt_depth_inter != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_inter_tile_group"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_inter_tile_group"); - } -#endif -#if 0 // Dual Tree - if (encoder->cfg.dual_i_tree) { - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_tile_group_chroma"); - - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_chroma"); - } - } -#endif if (LCU_WIDTH > 32) WRITE_U(stream, (TR_MAX_LOG2_SIZE - 5) ? 1 : 0, 1, "sps_max_luma_transform_size_64_flag"); @@ -665,7 +648,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_UE(stream, encoder->cfg.log2_parallel_merge_level-2, "log2_parallel_merge_level_minus2"); - WRITE_U(stream, 0, 1, "sps_isp_enabled_flag"); + WRITE_U(stream, encoder->cfg.isp, 1, "sps_isp_enabled_flag"); if (state->encoder_control->cfg.mrl) { WRITE_U(stream, 1, 1, "sps_mrl_enabled_flag"); @@ -706,7 +689,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "scaling_list_enabled_flag"); - WRITE_U(stream, 0, 1, "pic_dep_quant_enabled_flag"); + WRITE_U(stream, encoder->cfg.dep_quant, 1, "pic_dep_quant_enabled_flag"); WRITE_U(stream, encoder->cfg.signhide_enable, 1, "pic_sign_data_hiding_enabled_flag"); @@ -1142,7 +1125,7 @@ static void uvg_encoder_state_write_bitstream_picture_header( WRITE_U(stream, 0, 1, "ph_mvd_l1_zero_flag"); } - if (encoder->cfg.jccr) { + if (encoder->cfg.jccr && encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER @@ -1375,11 +1358,14 @@ void uvg_encoder_state_write_bitstream_slice_header( } // ToDo: depquant + if (encoder->cfg.dep_quant) { + WRITE_U(stream, 1, 1, "sh_dep_quant_used_flag"); + } - if (state->encoder_control->cfg.signhide_enable) { + if (state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { WRITE_U(stream, 1, 1, "sh_sign_data_hiding_used_flag"); } - if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable /* && !cfg.dep_quant*/) + if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { // TODO: find out what this is actually about and parametrize it WRITE_U(stream, 0, 1, "sh_ts_residual_coding_disabled_flag"); diff --git a/src/encoderstate.c b/src/encoderstate.c index cdadccf4..78c9c9f2 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -627,43 +627,52 @@ static void encode_sao(encoder_state_t * const state, * \param prev_qp -1 if QP delta has not been coded in current QG, * 
otherwise the QP of the current QG */ -static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp) +static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const + int depth) { // Stop recursion if the CU is completely outside the frame. - if (x >= state->tile->frame->width || y >= state->tile->frame->height) return; + if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return; - cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y); - const int cu_width = LCU_WIDTH >> depth; + cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y); + const int width = 1 << cu->log2_width; if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } - if (cu->depth > depth) { + if (cu_loc->width > width) { // Recursively process sub-CUs. - const int d = cu_width >> 1; - set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp); + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); } else { bool cbf_found = *prev_qp >= 0; - if (cu->tr_depth > depth) { + int y_limit = cu_loc->y + cu_loc->height; + int x_limit = cu_loc->x + cu_loc->width; + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. - const int tu_width = LCU_WIDTH >> cu->tr_depth; - for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) { - for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) { + const int tu_width = MIN(TR_MAX_WIDTH, 1 << cu->log2_width); + for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) { + for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) { cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); - if (cbf_is_set_any(tu->cbf, cu->depth)) { + if (cbf_is_set_any(tu->cbf)) { cbf_found = true; } } } - } else if (cbf_is_set_any(cu->cbf, cu->depth)) { + } else if (cbf_is_set_any(cu->cbf)) { cbf_found = true; } @@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las if (cbf_found) { *prev_qp = qp = cu->qp; } else { - qp = uvg_get_cu_ref_qp(state, x, y, *last_qp); + qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp); } // Set the correct QP for all state->tile->frame->cu_array elements in // the area covered by the CU. 
- for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) { - for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) { + for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) { + for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) { uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp; } } - if (is_last_cu_in_qg(state, x, y, depth)) { + if (is_last_cu_in_qg(state, cu_loc)) { *last_qp = cu->qp; } } @@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; - set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH); + set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) { @@ -870,10 +881,16 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) enum uvg_tree_type tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T; //Encode coding tree - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff, tree_type); + cu_loc_t start; + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; + + uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, 0, lcu->coeff, UVG_CHROMA_T); + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + cu_loc_t chroma_tree_loc = start; + uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true); } if (!state->cabac.only_count) { @@ -1152,6 +1169,12 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) uvg_threadqueue_submit(state->encoder_control->threadqueue, job[0]); uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]); +#ifdef UVG_DEBUG_PRINT_CABAC + // Ensures that the ctus are encoded in raster scan order + if(i >= state->tile->frame->width_in_lcu) { + uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[(lcu->id / state->tile->frame->width_in_lcu - 1) * state->tile->frame->width_in_lcu]); + } +#endif } uvg_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); @@ -1281,13 +1304,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) { sub_state->tile->frame->width_in_lcu * LCU_WIDTH, sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); - if(main_state->encoder_control->cfg.dual_tree){ + if(main_state->encoder_control->cfg.dual_tree && main_state->frame->is_irap){ sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray( main_state->tile->frame->chroma_cu_array, - offset_x / 2, - offset_y / 2, - sub_state->tile->frame->width_in_lcu * LCU_WIDTH_C, - sub_state->tile->frame->height_in_lcu * LCU_WIDTH_C + offset_x, + offset_y, + sub_state->tile->frame->width_in_lcu * LCU_WIDTH, + sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); } } @@ -1926,10 +1949,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const 
state, uvg_pict if (cfg->dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 && state->frame->is_irap) { assert(state->tile->frame->chroma_cu_array == NULL); - state->tile->frame->chroma_cu_array = uvg_cu_array_chroma_alloc( - state->tile->frame->width / 2, - state->tile->frame->height / 2, - state->encoder_control->chroma_format + state->tile->frame->chroma_cu_array = uvg_cu_array_alloc( + state->tile->frame->width, + state->tile->frame->height ); } // Set pictype. @@ -2029,9 +2051,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame) { #if UVG_DEBUG_PRINT_CABAC == 1 - uvg_cabac_bins_count = 0; + // uvg_cabac_bins_count = 0; if (state->frame->num == 0) uvg_cabac_bins_verbose = true; - else uvg_cabac_bins_verbose = false; + // else uvg_cabac_bins_verbose = false; #endif @@ -2193,11 +2215,12 @@ int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth); + const int qg_width = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_width); + const int qg_height = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_height); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); - const int y_qg = y & ~(qg_width - 1); + const int y_qg = y & ~(qg_height - 1); if(x_qg == 0 && y_qg > 0 && y_qg % LCU_WIDTH == 0) { return uvg_cu_array_at_const(cua, x_qg, y_qg - 1)->qp; } diff --git a/src/encoderstate.h b/src/encoderstate.h index 55d265e3..88409703 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -332,6 +332,7 @@ typedef struct encoder_state_t { int8_t qp; double c_lambda; + double chroma_weights[4]; /** * \brief Whether a QP delta value must be coded for the current LCU. @@ -359,7 +360,15 @@ typedef struct encoder_state_t { //Constraint structure void * constraint; + // Since LFNST needs the collocated luma intra mode for dual tree + // when the chroma mode is a CCLM mode, and gathering all the information + // needed to derive that collocated luma mode inside the LFNST functions + // would be unwieldy, store the current collocated luma mode in the state + // instead.
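The uvg_get_cu_ref_qp hunk above derives the quantization-group size per dimension from the CU's log2 dimensions rather than a shared depth, then masks the coordinates down to the QG origin. A self-contained sketch of that alignment arithmetic with hypothetical values (LCU_WIDTH is assumed to be 64, matching the 6 in the MAX expression):

#include <assert.h>

/* Round a coordinate down to the origin of its quantization group.
 * qg_size must be a power of two. */
static int qg_origin(int coord, int qg_size)
{
  return coord & ~(qg_size - 1);
}

static void qg_example(void)
{
  /* With max_qp_delta_depth == 1 and a 16x16 CU (log2 == 4):
   * qg_size = 1 << MAX(6 - 1, 4) = 32, so the sample at (48, 20)
   * belongs to the QG whose top-left corner is (32, 0). */
  assert(qg_origin(48, 32) == 32);
  assert(qg_origin(20, 32) == 0);
}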
+ int8_t collocated_luma_mode; + quant_block quant_blocks[3]; // luma, ISP, chroma + rate_estimator_t rate_estimator[4]; // luma, cb, cr, isp } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); @@ -401,14 +410,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) * \param depth depth in the CU tree * \return true, if it's the last CU in its QG, otherwise false */ -static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) +static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc) { if (state->frame->max_qp_delta_depth < 0) return false; - - const int cu_width = LCU_WIDTH >> depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; - const int right = x + cu_width; - const int bottom = y + cu_width; + const int right = cu_loc->x + cu_loc->width; + const int bottom = cu_loc->y + cu_loc->height; return (right % qg_width == 0 || right >= state->tile->frame->width) && (bottom % qg_width == 0 || bottom >= state->tile->frame->height); } diff --git a/src/filter.c b/src/filter.c index 2d51a17c..a55dc619 100644 --- a/src/filter.c +++ b/src/filter.c @@ -36,6 +36,7 @@ #include "cu.h" #include "encoder.h" +#include "intra.h" #include "uvg266.h" #include "transform.h" #include "videoframe.h" @@ -269,19 +270,19 @@ static bool is_tu_boundary( int32_t x, int32_t y, edge_dir dir, + color_t color, enum uvg_tree_type tree_type) { - x >>= tree_type == UVG_CHROMA_T; - y >>= tree_type == UVG_CHROMA_T; // if (x & 3 || y & 3) return false; const cu_info_t *const scu = uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y); - const int tu_width = LCU_WIDTH >> (scu->tr_depth + (tree_type == UVG_CHROMA_T)); if (dir == EDGE_HOR) { - return (y & (tu_width - 1)) == 0; + return color == COLOR_Y ? scu->luma_deblocking & EDGE_HOR : + scu->chroma_deblocking & EDGE_HOR; } else { - return (x & (tu_width - 1)) == 0; + return color == COLOR_Y ? scu->luma_deblocking & EDGE_VER : + scu->chroma_deblocking & EDGE_VER; } } @@ -306,32 +307,6 @@ static bool is_pu_boundary(const encoder_state_t *const state, it for now, in case some other tool requires it. */ return false; - //const cu_info_t *const scu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x, y); - //// Get the containing CU. 
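Annotation for the is_tu_boundary rewrite above: TU boundaries are no longer recomputed from tr_depth; each SCU carries precomputed luma_deblocking and chroma_deblocking masks, and the filter.h hunk later in this diff turns EDGE_VER and EDGE_HOR into distinct bits (1 and 2) so they can be OR'd into those masks. A sketch of the idea under that assumption, with stand-in types:

/* Assumed to mirror the new filter.h values: one bit per direction. */
enum { SKETCH_EDGE_VER = 1, SKETCH_EDGE_HOR = 2 };

typedef struct {
  unsigned luma_deblocking;   /* OR of the edge bits set at encode time */
  unsigned chroma_deblocking;
} sketch_scu_t;

/* Mark a TU edge once, when the transform split is decided... */
static void mark_tu_edge(sketch_scu_t *scu, unsigned dir, int luma)
{
  if (luma) scu->luma_deblocking   |= dir;
  else      scu->chroma_deblocking |= dir;
}

/* ...so that testing a boundary later is a plain mask check. */
static int is_boundary(const sketch_scu_t *scu, unsigned dir, int luma)
{
  return (luma ? scu->luma_deblocking : scu->chroma_deblocking) & dir;
}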
- //const int32_t cu_width = LCU_WIDTH >> scu->depth; - //const int32_t x_cu = x & ~(cu_width - 1); - //const int32_t y_cu = y & ~(cu_width - 1); - //const cu_info_t *const cu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu); - - //const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - //for (int i = 0; i < num_pu; i++) { - // if (dir == EDGE_HOR) { - // int y_pu = PU_GET_Y(cu->part_size, cu_width, y_cu, i); - // if (y_pu == y) { - // return true; - // } - - // } else { - // int x_pu = PU_GET_X(cu->part_size, cu_width, x_cu, i); - // if (x_pu == x) { - // return true; - // } - // } - //} - - //return false; } @@ -346,9 +321,9 @@ static bool is_pu_boundary(const encoder_state_t *const state, static bool is_on_8x8_grid(int x, int y, edge_dir dir) { if (dir == EDGE_HOR) { - return (y & 7) == 0 && (x & 2) == 0; + return (y & 7) == 0; } else { - return (x & 7) == 0 && (y & 2) == 0; + return (x & 7) == 0; } } @@ -628,10 +603,10 @@ static INLINE void get_max_filter_length(uint8_t *filt_len_P, uint8_t *filt_len_ bool transform_edge_4x4[2] = { false, false }; bool transform_edge_8x8[2] = { false, false }; - if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, tree_type); - if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, tree_type); - if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, tree_type); - if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, tree_type); + if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, comp, tree_type); + if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, comp, tree_type); + if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, comp, tree_type); + if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, comp, tree_type); if (comp == COLOR_Y) { if (tu_size_P_side <= 4 || tu_size_Q_side <= 4){ @@ -756,8 +731,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, cu_q = uvg_cu_array_at(frame->cu_array, x_coord, y); } - bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y); + bool nonzero_coeffs = cbf_is_set(cu_q->cbf, COLOR_Y) + || cbf_is_set(cu_p->cbf, COLOR_Y); // Filter strength strength = 0; @@ -766,7 +741,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, } else if (tu_boundary && nonzero_coeffs) { // Non-zero residual/coeffs and transform boundary - // Neither CU is intra so tr_depth <= MAX_DEPTH. strength = 1; } else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == UVG_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice? @@ -854,18 +828,50 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, bool is_side_Q_large = false; uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int cu_size = LCU_WIDTH >> cu_q->depth; - const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? 
PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + + const int cu_width = 1 << cu_q->log2_width; + const int cu_height = 1 << cu_q->log2_height; + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + int tu_size_q_side = 0; + if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_q_side = MAX(4, cu_width >> 2); + } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_q_side = MAX(4, cu_height >> 2); + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } + + int tu_size_p_side = 0; + if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2); + } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2); + } else { + tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + + } + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, - LCU_WIDTH >> cu_p->tr_depth, - LCU_WIDTH >> cu_q->tr_depth, + tu_size_p_side, + tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_Y, UVG_LUMA_T); @@ -1073,41 +1079,44 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, // CUs on both sides of the edge cu_info_t *cu_p; cu_info_t *cu_q; - int32_t x_coord = x << (tree_type != UVG_CHROMA_T); - int32_t y_coord = y << (tree_type != UVG_CHROMA_T); + int32_t x_coord = x << 1; + int32_t y_coord = y << 1; cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; if (dir == EDGE_VER) { - y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + y_coord = (y + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord - 1, y_coord); cu_q = uvg_cu_array_at(cua, x_coord , y_coord); } else { - x_coord = (x + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + x_coord = (x + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1); cu_q = uvg_cu_array_at(cua, x_coord, y_coord ); } - - const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T)); - const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? 
y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int tu_p_size = LCU_WIDTH >> (cu_p->tr_depth + (chroma_shift)); - const int tu_q_size = LCU_WIDTH >> (cu_q->tr_depth + (chroma_shift)); + const int cu_width = 1 << (cu_q->log2_chroma_width ); + const int cu_height = 1 << (cu_q->log2_chroma_height); + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + + + const int tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << (cu_p->log2_chroma_height), TR_MAX_WIDTH) : + MIN(1 << (cu_p->log2_chroma_width), TR_MAX_WIDTH); + const int tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << (cu_q->log2_chroma_height ), TR_MAX_WIDTH) : + MIN(1 << (cu_q->log2_chroma_width ), TR_MAX_WIDTH); + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, - dir, tu_boundary, tu_p_size, tu_q_size, + dir, tu_boundary, tu_size_p_side, tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_U, tree_type); const bool large_boundary = (max_filter_length_P >= 3 && max_filter_length_Q >= 3); - const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % (LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) == 0); + const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % LCU_WIDTH == 0); uint8_t c_strength[2] = { 0, 0 }; @@ -1116,10 +1125,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, c_strength[1] = 2; } else if (tu_boundary){ //TODO: Add ciip/IBC related stuff - bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_U) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_U); - bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_V) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_V); + bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, COLOR_U) + || cbf_is_set(cu_p->cbf, COLOR_U); + bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, COLOR_V) + || cbf_is_set(cu_p->cbf, COLOR_V); c_strength[0] = nonzero_coeffs_U ? 1 : 0; c_strength[1] = nonzero_coeffs_V ? 1 : 0; } @@ -1237,11 +1246,12 @@ static void filter_deblock_unit( // Chroma pixel coordinates. const int32_t x_c = x >> 1; const int32_t y_c = y >> 1; - if (state->encoder_control->chroma_format != UVG_CSP_400 && - (is_on_8x8_grid(x_c, y_c, dir && (x_c + 4) % 32) - || (x == state->tile->frame->width - 8 && dir == 1 && y_c % 8 == 0)) + if (state->encoder_control->chroma_format != UVG_CSP_400 && + is_tu_boundary(state, x, y, dir, COLOR_UV, tree_type) + && (is_on_8x8_grid(x_c, y_c, dir == EDGE_HOR && (x_c + 4) % 32 ? 
EDGE_HOR : EDGE_VER) + || (x == state->tile->frame->width - 8 && dir == EDGE_HOR && y_c % 8 == 0)) && tree_type != UVG_LUMA_T) { - filter_deblock_edge_chroma(state, x_c, y_c, length, dir, tu_boundary, tree_type); + filter_deblock_edge_chroma(state, x_c, y_c, 2, dir, tu_boundary, tree_type); } } @@ -1271,11 +1281,11 @@ static void filter_deblock_lcu_inside(encoder_state_t * const state, for (int edge_y = y; edge_y < end_y; edge_y += 4) { for (int edge_x = x; edge_x < end_x; edge_x += 4) { - bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, luma_tree); + bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, COLOR_Y, luma_tree); if (tu_boundary || is_pu_boundary(state, edge_x, edge_y, dir)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, luma_tree); } - if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, chroma_tree)) { + if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, COLOR_UV, chroma_tree)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, chroma_tree); } } @@ -1302,7 +1312,7 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, for (int x = x_px - 8; x < x_px; x += 4) { for (int y = y_px; y < end; y += 4) { // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, luma_tree); + bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, COLOR_Y, luma_tree); if (y > 0 && (tu_boundary || is_pu_boundary(state, x, y, EDGE_HOR))) { filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR, tu_boundary); } @@ -1313,13 +1323,15 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, if (state->encoder_control->chroma_format != UVG_CSP_400) { const int x_px_c = x_px >> 1; const int y_px_c = y_px >> 1; - const int x_c = x_px_c - 4; - const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); - for (int y_c = y_px_c; y_c < end_c; y_c += 8) { - // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, chroma_tree); - if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { - filter_deblock_edge_chroma(state, x_c , y_c, 4, EDGE_HOR, tu_boundary, chroma_tree); + int x_c = x_px_c - 4; + const int end_c_y = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); + for(; x_c < x_px_c; x_c += 2) { + for (int y_c = y_px_c; y_c < end_c_y; y_c += 8) { + // The top edge of the whole frame is not filtered. + bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, COLOR_UV, chroma_tree); + if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { + filter_deblock_edge_chroma(state, x_c , y_c, 2, EDGE_HOR, tu_boundary, chroma_tree); + } } } } diff --git a/src/filter.h b/src/filter.h index 0d98eedd..2db9c871 100644 --- a/src/filter.h +++ b/src/filter.h @@ -46,8 +46,8 @@ * \brief Edge direction. */ typedef enum edge_dir { - EDGE_VER = 0, // vertical - EDGE_HOR = 1, // horizontal + EDGE_VER = 1, // vertical + EDGE_HOR = 2, // horizontal } edge_dir; diff --git a/src/global.h b/src/global.h index 65ca2fa9..972b7e82 100644 --- a/src/global.h +++ b/src/global.h @@ -145,11 +145,11 @@ typedef int32_t mv_t; #define INTERNAL_MV_PREC 4 // Internal motion vector precision, 4 = 1/16 pel -//! Limits for prediction block sizes. 0 = 64x64, 4 = 4x4. +//! Limits for prediction block sizes. 
#define PU_DEPTH_INTER_MIN 0 -#define PU_DEPTH_INTER_MAX 3 +#define PU_DEPTH_INTER_MAX 8 #define PU_DEPTH_INTRA_MIN 0 -#define PU_DEPTH_INTRA_MAX 4 +#define PU_DEPTH_INTRA_MAX 8 //! Maximum number of layers in GOP structure (for allocating structures) #define MAX_GOP_LAYERS 6 @@ -273,7 +273,6 @@ typedef int32_t mv_t; #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value)) #define CLIP_TO_QP(value) CLIP(0, 51, (value)) #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; } -#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth) #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val)) #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) diff --git a/src/inter.c b/src/inter.c index 3bbef427..d275f4ea 100644 --- a/src/inter.c +++ b/src/inter.c @@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -static unsigned inter_recon_unipred(const encoder_state_t * const state, - const uvg_picture * const ref, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, - int32_t out_stride_luma, - const mv_t mv_param[2], - yuv_t *yuv_px, - yuv_im_t *yuv_im, - bool predict_luma, - bool predict_chroma) +static unsigned inter_recon_unipred( + const encoder_state_t * const state, + const uvg_picture * const ref, + int32_t out_stride_luma, + const mv_t mv_param[2], + yuv_t *yuv_px, + yuv_im_t *yuv_im, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { vector2d_t int_mv = { mv_param[0], mv_param[1] }; uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv); + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + const vector2d_t int_mv_in_frame = { int_mv.x + pu_x + state->tile->offset_x, int_mv.y + pu_y + state->tile->offset_y @@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_bipred(const encoder_state_t *const state, +void uvg_inter_recon_bipred( + const encoder_state_t *const state, const uvg_picture *ref1, const uvg_picture *ref2, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, mv_t mv_param[2][2], lcu_t *lcu, bool predict_luma, - bool predict_chroma) + bool predict_chroma, + const cu_loc_t* const cu_loc) { // Allocate maximum size arrays for interpolated and copied samples ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; @@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + yuv_t px_L0; px_L0.size = pu_w * pu_h; px_L0.y = &px_buf_L0[0]; @@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, // Sample blocks from both reference picture lists. // Flags state if the outputs were written to high-precision / interpolated sample buffers. 
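For orientation before the call-site changes below: uvg_bipred_average merges the two unidirectional predictions sampled here. Ignoring the high-precision im buffers that the im_flags return values select, the per-pixel operation reduces to a rounded mean; a simplified sketch, not the actual implementation:

#include <stdint.h>

/* Rounded average of two unidirectional predictions. The real
 * uvg_bipred_average additionally handles the high-bit-depth
 * intermediate (im) buffers chosen by the im_flags values above. */
static void bipred_average_px(const uint8_t *l0, const uint8_t *l1,
                              uint8_t *out, int n)
{
  for (int i = 0; i < n; ++i) {
    out[i] = (uint8_t)((l0[i] + l1[i] + 1) >> 1);
  }
}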
- unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0], - &px_L0, &im_L0, predict_luma, predict_chroma); - unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1], - &px_L1, &im_L1, predict_luma, predict_chroma); + unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma, + cu_loc); + unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma, + cu_loc); // After reconstruction, merge the predictors by taking an average of each pixel uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1, @@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma) +void uvg_inter_recon_cu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i); - } + uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc); } static void ibc_recon_cu(const encoder_state_t * const state, @@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t y, int32_t width, bool predict_luma, - bool predict_chroma, - int i_pu) + bool predict_chroma) { const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state, * \param predict_chroma Enable or disable chroma prediction for this call. * \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP. 
*/ -void uvg_inter_pred_pu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma, - int i_pu) +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - const int x_scu = SUB_SCU(x); - const int y_scu = SUB_SCU(y); - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); - const int pu_w = PU_GET_W(cu->part_size, width, i_pu); - const int pu_h = PU_GET_H(cu->part_size, width, i_pu); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + const int x_scu = SUB_SCU(cu_loc->x); + const int y_scu = SUB_SCU(cu_loc->y); + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - if (cu->type == CU_IBC) { - ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); - } else { + if (pu->inter.mv_dir == 3) { + const uvg_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + uvg_inter_recon_bipred(state, + refs[0], refs[1], + pu->inter.mv, lcu, + predict_luma, predict_chroma, + cu_loc); + } + else if (pu->type == CU_IBC) { + ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma); + } else{ + const int mv_idx = pu->inter.mv_dir - 1; + const uvg_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; - if (pu->inter.mv_dir == 3) { - const uvg_picture * const refs[2] = { - state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]], - state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]], - }; - uvg_inter_recon_bipred( - state, - refs[0], - refs[1], - pu_x, - pu_y, - pu_w, - pu_h, - pu->inter.mv, - lcu, - predict_luma, - predict_chroma); - } else { - const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture * const ref = - state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]; + const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x); + const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2; + yuv_t lcu_adapter; + lcu_adapter.size = cu_loc->width * cu_loc->height; + lcu_adapter.y = lcu->rec.y + offset_luma; + lcu_adapter.u = lcu->rec.u + offset_chroma; + lcu_adapter.v = lcu->rec.v + offset_chroma; - const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); - const unsigned offset_chroma = - SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; - yuv_t lcu_adapter; - lcu_adapter.size = pu_w * pu_h; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, - - inter_recon_unipred( - state, - ref, - pu_x, - pu_y, - pu_w, - pu_h, - LCU_WIDTH, - pu->inter.mv[mv_idx], - &lcu_adapter, - NULL, - predict_luma, - predict_chroma); - } + inter_recon_unipred(state, + ref, + LCU_WIDTH, pu->inter.mv[mv_idx], + &lcu_adapter, + NULL, + predict_luma, + predict_chroma, + cu_loc); } if (predict_chroma && state->encoder_control->cfg.jccr) { const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(lcu->rec.v + offset, 
lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -915,14 +899,12 @@ static bool is_b0_cand_coded(int x, int y, int width, int height) * \param ref_idx index in the reference list * \param cand_out will be filled with C0 and C1 candidates */ -static void get_temporal_merge_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - uint8_t ref_list, - uint8_t ref_idx, - merge_candidates_t *cand_out) +static void get_temporal_merge_candidates( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + uint8_t ref_list, + uint8_t ref_idx, + merge_candidates_t *cand_out) { /* Predictor block locations @@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref]; int cu_per_width = ref_cu_array->width / SCU_WIDTH; - int32_t xColBr = x + width; - int32_t yColBr = y + height; + int32_t xColBr = cu_loc->x + cu_loc->width; + int32_t yColBr = cu_loc->y + cu_loc->height; // C0 must be available if (xColBr < state->encoder_control->in.width && @@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } } - int32_t xColCtr = x + (width / 2); - int32_t yColCtr = y + (height / 2); + int32_t xColCtr = cu_loc->x + (cu_loc->width / 2); + int32_t yColCtr = cu_loc->y + (cu_loc->height / 2); // C1 must be inside the LCU, in the center position of current CU if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) { @@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates(int32_t x, - int32_t y, - int32_t width, - int32_t height, +static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc, int32_t picture_width, int32_t picture_height, lcu_t *lcu, @@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(cu_loc->y); + + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; // A0 and A1 availability testing if (x != 0) { cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); @@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x, * \param picture_height tile height in pixels * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates_cua(const cu_array_t *cua, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int32_t picture_width, - int32_t picture_height, - merge_candidates_t *cand_out, - bool wpp) +static void get_spatial_merge_candidates_cua( + const cu_array_t *cua, + int32_t picture_width, + int32_t picture_height, + merge_candidates_t *cand_out, + bool wpp, + const cu_loc_t* const cu_loc) { /* Predictor block locations @@ -1370,8 +1352,12 @@ static 
void get_spatial_merge_candidates_cua(const cu_array_t *cua, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; + const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(y); // A0 and A1 availability testing if (x != 0) { const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); @@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state, /** * \brief Pick two mv candidates from the spatial and temporal candidates. */ -static void get_mv_cand_from_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - const merge_candidates_t *merge_cand, - const cu_info_t * const cur_cu, - int8_t reflist, - mv_t mv_cand[2][2]) +static void get_mv_cand_from_candidates( + const encoder_state_t * const state, + const merge_candidates_t *merge_cand, + const cu_info_t * const cur_cu, + int8_t reflist, + mv_t mv_cand[2][2], + int ctu_row) { const cu_info_t *const *a = merge_cand->a; const cu_info_t *const *b = merge_cand->b; @@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, if (candidates < AMVP_MAX_NUM_CANDS) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { @@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t * const cur_cu, - lcu_t *lcu, - int8_t reflist) +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t * const cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); - } else { - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], 
&mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, * \param cur_cu current CU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist) +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const cu_array_t *cua = state->tile->frame->cu_array; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { get_spatial_merge_candidates_cua(cua, - x, y, width, height, - state->tile->frame->width, state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp, + cu_loc); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1885,23 +1864,23 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) { * \param lcu lcu containing the block * \return number of merge candidates */ -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu) +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu) { uint8_t candidates = 0; int8_t zero_idx = 0; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; merge_candidates_t merge_cand = { 0 }; const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; + // Current CU + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y)); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); if(cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) { mv_cand[i].dir = 1; mv_cand[i].mv[0][0] = ibc_mv_cand[i][0]; @@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, } return IBC_MRG_MAX_NUM_CANDS; } - - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level, 
state->encoder_control->cfg.wpp); + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); const cu_info_t **a = merge_cand.a; const cu_info_t **b = merge_cand.b; - if (!use_a1) a[1] = NULL; - if (!use_b1) b[1] = NULL; + const int x = cu_loc->x; + const int y = cu_loc->y; if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++; if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++; @@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, for (int reflist = 0; reflist <= max_reflist; reflist++) { // Fetch temporal candidates for the current CU // ToDo: change collocated_from_l0_flag to allow L1 ref - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // TODO: enable L1 TMVP candidate // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand); @@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, if (candidates == max_num_cands) return candidates; if (candidates != max_num_cands - 1) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; diff --git a/src/inter.h b/src/inter.h index 45f5e5ea..4d5fccd5 100644 --- a/src/inter.h +++ b/src/inter.h @@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv); void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver); void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv); -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma); - -void uvg_inter_pred_pu(const encoder_state_t * const state, +void uvg_inter_recon_cu( + const encoder_state_t * const state, lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, bool predict_luma, bool predict_chroma, - int i_pu); + const cu_loc_t* const cu_loc); + +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu); -void uvg_inter_recon_bipred(const encoder_state_t * const state, - const uvg_picture * ref1, - const uvg_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - mv_t mv_param[2][2], - lcu_t* lcu, - bool predict_luma, - bool predict_chroma); +void uvg_inter_recon_bipred( + const encoder_state_t * const state, + const uvg_picture * ref1, + const uvg_picture * ref2, + mv_t mv_param[2][2], + lcu_t* lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - lcu_t *lcu, - int8_t reflist); +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* 
const cu_loc); -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist); +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc); -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu); +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu); #endif diff --git a/src/intra.c b/src/intra.c index 7e742d46..22eb93c7 100644 --- a/src/intra.c +++ b/src/intra.c @@ -37,6 +37,10 @@ #include "image.h" #include "uvg_math.h" #include "mip_data.h" +#include "rdo.h" +#include "search.h" +#include "search_intra.h" +#include "strategies-picture.h" #include "strategies/strategies-intra.h" #include "tables.h" #include "transform.h" @@ -197,6 +201,7 @@ int8_t uvg_intra_get_dir_luma_predictor( static void intra_filter_reference( int_fast8_t log2_width, + int_fast8_t log2_height, uvg_intra_references *refs) { if (refs->filtered_initialized) { @@ -206,6 +211,7 @@ static void intra_filter_reference( } const int_fast8_t ref_width = 2 * (1 << log2_width) + 1; + const int_fast8_t ref_height = 2 * (1 << log2_height) + 1; uvg_intra_ref *ref = &refs->ref; uvg_intra_ref *filtered_ref = &refs->filtered_ref; @@ -213,14 +219,13 @@ static void intra_filter_reference( filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) >> 2; filtered_ref->top[0] = filtered_ref->left[0]; - // TODO: use block height here instead of ref_width // Top to bottom - for (int_fast8_t y = 1; y < ref_width - 1; ++y) { + for (int_fast8_t y = 1; y < ref_height - 1; ++y) { uvg_pixel *p = &ref->left[y]; filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2; } // Bottom left (not filtered) - filtered_ref->left[ref_width - 1] = ref->left[ref_width - 1]; + filtered_ref->left[ref_height - 1] = ref->left[ref_height - 1]; // Left to right for (int_fast8_t x = 1; x < ref_width - 1; ++x) { @@ -231,39 +236,48 @@ static void intra_filter_reference( filtered_ref->top[ref_width - 1] = ref->top[ref_width - 1]; } - /** * \brief Generate dc prediction. -* \param log2_width Log2 of width, range 2..5. +* \param cu_loc CU location and size data. +* \param color Color channel. * \param ref_top Pointer to -1 index of above reference, length=width*2+1. * \param ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. */ static void intra_pred_dc( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const out_block, const uint8_t multi_ref_idx) { - int_fast8_t width = 1 << log2_width; - + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int_fast16_t sum = 0; - for (int_fast8_t i = 0; i < width; ++i) { - sum += ref_top[i + 1 + multi_ref_idx]; - sum += ref_left[i + 1 + multi_ref_idx]; + // Only one loop is done for non-square blocks. 
+ // In case of non-square blocks, only the longer reference is summed. + if (width >= height) { + for (int_fast8_t i = 0; i < width; ++i) { + sum += ref_top[i + 1 + multi_ref_idx]; + } + } + if (width <= height) { + for (int_fast8_t j = 0; j < height; ++j) { + sum += ref_left[j + 1 + multi_ref_idx]; + } } // JVET_K0122 - // TODO: take non-square blocks into account - const int denom = width << 1; + const int denom = width == height ? width << 1 : MAX(width, height); const int divShift = uvg_math_floor_log2(denom); const int divOffset = denom >> 1; const uvg_pixel dc_val = (sum + divOffset) >> divShift; //const uvg_pixel dc_val = (sum + width) >> (log2_width + 1); - const int_fast16_t block_size = 1 << (log2_width * 2); + const int_fast16_t block_size = width * height; for (int_fast16_t i = 0; i < block_size; ++i) { out_block[i] = dc_val; @@ -271,6 +285,33 @@ static void intra_pred_dc( } +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t * const luma_loc, cu_info_t const * const cur_cu, enum + uvg_tree_type tree_type) +{ + if (tree_type != UVG_CHROMA_T) { + return true; + } + uint32_t chroma_split_depth0 = GET_SPLITDATA(cur_cu, 0); + uint32_t chroma_split_depth1 = GET_SPLITDATA(cur_cu, 1); + bool allow = false; + if (chroma_split_depth0 == QT_SPLIT || (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == BT_VER_SPLIT)) allow = true; + else if (chroma_split_depth0 == NO_SPLIT) allow = true; + else if (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == NO_SPLIT) allow = true; + if (!allow) { + return false; + } + const cu_info_t* const luma_cu = uvg_cu_array_at_const(state->tile->frame->cu_array, luma_loc->x, luma_loc->y); + uint32_t split = GET_SPLITDATA(luma_cu, 0); + if (split != NO_SPLIT) { + allow = split == QT_SPLIT; + } + else if (split != NO_SPLIT && luma_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + allow = false; + } + return allow; +} + + enum lm_mode { LM_CHROMA_IDX = 81, @@ -286,7 +327,7 @@ static void get_cclm_parameters( uvg_intra_ref* luma_src, uvg_intra_references*chroma_ref, int16_t *a, int16_t*b, int16_t*shift) { - const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX); + const int base_unit_size = 4; // TODO: take into account YUV422 const int unit_w = base_unit_size >> 1; @@ -312,8 +353,8 @@ static void get_cclm_parameters( //int left_below_units = total_left_units - tu_height_in_units; //int avai_above_right_units = 0; // TODO these are non zero only with non-square CUs //int avai_left_below_units = 0; - int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size); - int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size); + int avai_above_units = y0 ? tu_width_in_units : 0; + int avai_left_units = x0 ? 
tu_height_in_units : 0; bool above_available = avai_above_units != 0; bool left_available = avai_left_units != 0; @@ -491,9 +532,8 @@ static void predict_cclm( const lcu_t* const lcu, uvg_intra_references* chroma_ref, uvg_pixel* dst, - cclm_parameters_t* cclm_params, - enum uvg_tree_type tree_type - ) + cclm_parameters_t* cclm_params +) { assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX); assert(state->encoder_control->cfg.cclm); @@ -511,20 +551,14 @@ static void predict_cclm( const uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - - // Essentially what this does is that it uses 6-tap filtering to downsample - // the luma intra references down to match the resolution of the chroma channel. - // The luma reference is only needed when we are not on the edge of the picture. - // Because the reference pixels that are needed on the edge of the ctu this code - // is kinda messy but what can you do - const int ctu_size = tree_type == UVG_CHROMA_T ? LCU_WIDTH_C : LCU_WIDTH; + + const int ctu_size = LCU_WIDTH; if (y0) { - if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 2), (state->tile->frame->width - x0 - width* 2) / 2); + if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 4), (state->tile->frame->width - x0 - width* 2) / 4); for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; - x_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu >> (tree_type==UVG_CHROMA_T)) - 4); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu) - 4); if (x_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; } if(y_scu == 0) { @@ -547,13 +581,12 @@ static void predict_cclm( } if(x0) { - if (x_scu == 0) available_left_below = MIN(MIN(width / 2, (64 - y_scu - height * 2) / 2), (state->tile->frame->height - y0 - height * 2) / 2); + if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 4), (state->tile->frame->height - y0 - height * 2) / 4); for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; - y_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu >> (tree_type == UVG_CHROMA_T)) - 4, y_extension); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu) - 4, y_extension); if (y_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; - if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; + if(x_scu == 32 && y_scu == 0 && pu->log2_height == 6 && pu->log2_width == 6 ) break; } for(int i = 0; i < height + available_left_below * 2; i++) { sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; @@ -573,12 +606,18 @@ static void predict_cclm( } -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a) { +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a) { assert(!(lcu && cu_a)); - if (width > 2 * height || height > 2 * width) { + if (cu_loc->width > 2 * cu_loc->height || cu_loc->height > 2 * cu_loc->width) { return 3; } - + + const int x = cu_loc->x; + const int y = 
cu_loc->y; + int context = 0; const cu_info_t* left = NULL; const cu_info_t* top = NULL; @@ -898,39 +937,77 @@ static void mip_predict( } +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const int log2_width, + const int log2_height, + const + bool account_for_dc_planar) +{ + int8_t pred_mode = mode; + if (log2_width != log2_height) { + if (mode > 1 && mode <= 66) { + const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; + const int deltaSize = abs(log2_width - log2_height); + if (log2_width > log2_height && mode < 2 + modeShift[deltaSize]) { + pred_mode += (66 - 1); + } + else if (log2_height > log2_width && mode > 66 - modeShift[deltaSize]) { + pred_mode -= (66 - 1) + (account_for_dc_planar ? 2 : 0); + } + } + } + return pred_mode; +} + static void intra_predict_regular( const encoder_state_t* const state, uvg_intra_references *refs, - int_fast8_t log2_width, + const cu_info_t* const cur_cu, + const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, int_fast8_t mode, color_t color, uvg_pixel *dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode) { - const int_fast8_t width = 1 << log2_width; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; const uvg_config *cfg = &state->encoder_control->cfg; // MRL only for luma uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; + uint8_t isp = color == COLOR_Y ? isp_mode : 0; + + // Wide angle correction + int8_t pred_mode = uvg_wide_angle_correction( + mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + false + ); const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || width == 4 || multi_ref_index) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. - if (width * width > 32) { + if (width * height > 32) { used_ref = &refs->filtered_ref; } } else { // Angular modes use smoothed reference pixels, unless the mode is close // to being either vertical or horizontal. static const int uvg_intra_hor_ver_dist_thres[8] = {24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_width) >> 1]; - int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18)); + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; + int dist_from_vert_or_hor = MIN(abs(pred_mode - 50), abs(pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (mode >= 34) ? mode - 50 : 18 - mode; + const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; const int_fast8_t sample_disp = (mode_disp < 0 ? 
-1 : 1) * modedisp2sampledisp[abs(mode_disp)]; if ((abs(sample_disp) & 0x1F) == 0) { used_ref = &refs->filtered_ref; @@ -939,38 +1016,66 @@ static void intra_predict_regular( } if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) { - intra_filter_reference(log2_width, refs); + intra_filter_reference(log2_width, log2_height, refs); } if (mode == 0) { - uvg_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst); + uvg_intra_pred_planar(pu_loc, color, used_ref->top, used_ref->left, dst); } else if (mode == 1) { - intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst, multi_ref_index); + intra_pred_dc(pu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(log2_width, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index); + uvg_angular_pred( + pu_loc, + pred_mode, + color, + used_ref->top, + used_ref->left, + dst, + multi_ref_index, + isp, + isp_mode == ISP_MODE_HOR ? cu_loc->height : cu_loc->width); } // pdpc // bool pdpcCondition = (mode == 0 || mode == 1 || mode == 18 || mode == 50); bool pdpcCondition = (mode == 0 || mode == 1); // Planar and DC + pdpcCondition &= width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH; if (pdpcCondition && multi_ref_index == 0) // Cannot be used with MRL. { - uvg_pdpc_planar_dc(mode, width, log2_width, used_ref, dst); + uvg_pdpc_planar_dc(mode, pu_loc, color, used_ref, dst); } } void uvg_intra_build_reference_any( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, const lcu_t *const lcu, uvg_intra_references *const refs, const uint8_t multi_ref_idx, - uvg_pixel *extra_ref_lines) + uvg_pixel *extra_ref_lines, + const uint8_t isp_mode) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // These are only used with ISP, so no need to check chroma + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel *out_left_ref = &refs->ref.left[0]; @@ -978,8 +1083,7 @@ void uvg_intra_build_reference_any( const uvg_pixel dc_val = 1 << (UVG_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-square blocks - const int_fast8_t width = 1 << log2_width; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multi ref index from CU under prediction or reconstrcution. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1038,12 +1142,24 @@ void uvg_intra_build_reference_any( // Generate left reference. if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. 
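A worked example for the uvg_wide_angle_correction hunk above, restating what its modeShift table yields for one hypothetical block shape:

static void wide_angle_example(void)
{
  /* A 16x4 block: log2_width 4, log2_height 2, so deltaSize = 2 and
   * modeShift[2] = 10. Angular modes 2..11 then satisfy
   * mode < 2 + modeShift and are remapped into the wide-angle range
   * past mode 66. */
  int8_t mode = uvg_wide_angle_correction(5, 4, 2, false);
  /* 5 < 2 + 10, hence mode == 5 + (66 - 1) == 70. */
  (void)mode;
}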
- int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_left; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? num_cus * 4 : num_cus * 2; + } // Limit the number of available pixels based on block size and dimensions // of the picture. - // TODO: height for non-square blocks - px_available_left = MIN(px_available_left, width * 2 + multi_ref_index); + px_available_left = MIN(px_available_left, cu_height * 2 + multi_ref_index); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1053,13 +1169,18 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride]; - for (int i = px_available_left; i < width * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = px_available_left; i < tmp_h + multi_ref_index * 2; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // If we are on the left edge, extend the first pixel of the top row. uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = 0; i < tmp_h + multi_ref_index; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -1142,13 +1263,26 @@ void uvg_intra_build_reference_any( } // Generate top reference. + int px_available_top; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; - + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = !is_chroma ? num_cus * 4 : num_cus * 2; + } + // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); // Copy all the pixels we can. @@ -1157,20 +1291,28 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = top_border[px_available_top - 1]; - for (int i = px_available_top; i < width * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? 
cu_width + width : width * 2); + for (int i = px_available_top; i < tmp_w + multi_ref_index * 2; ++i) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // Extend nearest pixel. uvg_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); + for (int i = 0; i < tmp_w + multi_ref_index * 2; i++) { out_top_ref[i + 1] = nearest_pixel; } } } void uvg_intra_build_reference_inner( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1178,17 +1320,33 @@ void uvg_intra_build_reference_inner( uvg_intra_references *const refs, bool entropy_sync, const uint8_t multi_ref_idx, - uvg_pixel* extra_ref_lines) + uvg_pixel* extra_ref_lines, + uint8_t isp_mode) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int cu_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // These are only used with ISP, so no need to check chroma + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + + // Log2_dim 1 is possible with ISP blocks + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel * __restrict out_left_ref = &refs->ref.left[0]; uvg_pixel * __restrict out_top_ref = &refs->ref.top[0]; const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-sqaure blocks - const int_fast8_t width = 1 << log2_width; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multiRefIdx from CU under prediction. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1288,27 +1446,59 @@ void uvg_intra_build_reference_inner( } // Generate left reference. -// Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + // Get the number of reference pixels based on the PU coordinate within the LCU. + int px_available_left; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; + } + + } + else { + if(!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + } else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? num_cus * 4 : num_cus * 2; + } + } // Limit the number of available pixels based on block size and dimensions // of the picture. 
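The tmp_h/tmp_w expressions recurring in the extension loops above and below all encode one rule for how far the reference samples must be extended under ISP. A condensed restatement for the left column (luma only; left_ref_extension is an illustrative name, not a helper in the patch):

/* How far the left reference column is extended past the available
 * pixels, per the hunks above (multi_ref_index offsets omitted). */
static int left_ref_extension(int isp_mode, int is_first_isp_block,
                              int cu_height, int pu_height)
{
  if (is_first_isp_block) {
    return cu_height * 2;         /* First ISP partition: sample as if it were the unsplit CU. */
  }
  if (isp_mode) {
    return cu_height + pu_height; /* Later partitions reach to the end of the CU. */
  }
  return pu_height * 2;           /* Non-ISP blocks: the usual two block heights. */
}

The top reference row follows the same rule with widths in place of heights.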
- px_available_left = MIN(px_available_left, width * 2); + px_available_left = MIN(px_available_left, cu_height * 2); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. int i = multi_ref_index; // Offset by multi_ref_index - do { - out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; - out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; - out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; - out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; - i += 4; - } while (i < px_available_left); + + // Do different loop for heights smaller than 4 (possible for some ISP splits) + if (px.y % 4 != 0 || px_available_left < 4) { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + i += 1; + } while (i < px_available_left); + } + else { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; + out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; + out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; + i += 4; + } while (i < px_available_left); + } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = out_left_ref[i]; - for (; i < width * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (; i < tmp_h; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; @@ -1317,7 +1507,7 @@ void uvg_intra_build_reference_inner( // Extend for MRL if (multi_ref_index) { - for (; i < width * 2 + multi_ref_index; ++i) { + for (; i < height * 2 + multi_ref_index; ++i) { out_left_ref[i + 1] = nearest_pixel; } } @@ -1325,11 +1515,24 @@ void uvg_intra_build_reference_inner( // Generate top reference. // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_top; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = !is_chroma ? num_cus * 4 : num_cus * 2; + } // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1); @@ -1343,7 +1546,10 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. nearest_pixel = out_top_ref[i + multi_ref_index]; - for (; i < (width + multi_ref_index) * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? 
cu_width + width : width * 2); + for (; i < tmp_w + (multi_ref_index * 2); i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel; @@ -1351,8 +1557,11 @@ void uvg_intra_build_reference_inner( } } + void uvg_intra_build_reference( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1360,15 +1569,19 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_ref_lines, - uint8_t multi_ref_idx) + uint8_t multi_ref_idx, + const uint8_t isp_mode) { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); + //bool first_split = color == COLOR_Y && isp_mode && pu_loc->x == cu_loc->x && pu_loc->y == cu_loc->y; + //uint8_t isp = first_split ? 0 : isp_mode; + // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(log2_width, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_inner(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp_mode); } else { - uvg_intra_build_reference_any(log2_width, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_any(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp_mode); } } @@ -1377,21 +1590,21 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ) + const lcu_t* lcu +) { const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // TODO: what is this used for? // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); bool use_mip = false; - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int x = cu_loc->x; - const int y = cu_loc->y; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int x = pu_loc->x; + const int y = pu_loc->y; int8_t intra_mode = color == COLOR_Y ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma; if (data->pred_cu.intra.mip_flag) { if (color == COLOR_Y) { @@ -1407,68 +1620,153 @@ void uvg_intra_predict( mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { - intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); } } else { - uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); - if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 
0 : 1].b <= 0) { + uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, height, stride / 2, width); + if (width != 1 << data->pred_cu.log2_chroma_width || height != 1 << data->pred_cu.log2_chroma_height || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, - (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1], - tree_type); + state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, + (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]); } else { - linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); + linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, height); } } } // This function works on luma coordinates -const cu_info_t* uvg_get_co_located_luma_cu( - int x, - int y, - int width, - int height, +int8_t uvg_get_co_located_luma_mode( + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type) { + int x = chroma_loc->x; + int y = chroma_loc->y; assert((cu_array || lcu) && !(cu_array && lcu)); assert(tree_type != UVG_LUMA_T && "Luma only CU shouldn't need colocated luma CU"); if(tree_type == UVG_CHROMA_T) { - x += width >> 1; - y += height >> 1; + x += chroma_loc->width >> 1; + y += chroma_loc->height >> 1; } - if(cu_array) { - return uvg_cu_array_at_const(cu_array, x, y); + const cu_info_t* cu; + if (lcu && cu_loc->x <= x && x < cu_loc->x + cu_loc->width && cu_loc->y <= y && y < cu_loc->y + cu_loc->height) { + cu = luma_cu; + } + else if(cu_array) { + cu = uvg_cu_array_at_const(cu_array, x, y); } else { - return LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); } + if (cu->intra.mip_flag) { + return 0; + } + return cu->intra.mode; +} + + + + +/** +* \brief Returns ISP split partition size based on block dimensions and split type. +* +* Returns ISP split partition size based on block dimensions and split type. +* Will fail if resulting partition size has less than 16 samples. +* +* \param width Block width. +* \param height Block height. +* \param split_type Horizontal or vertical split. +* \param is_transform_split True when computing transform block dimensions instead of prediction block dimensions. +*/ +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_split) +{ + assert(split_type != ISP_MODE_NO_ISP && "Cannot calculate split dimension if no split type is set. Make sure this function is not called in this case."); + + bool divide_in_rows = split_type == SPLIT_TYPE_HOR; + int split_dim_size, non_split_dim_size, partition_size, div_shift = 2; + + if (divide_in_rows) { + split_dim_size = height; + non_split_dim_size = width; + } + else { + split_dim_size = width; + non_split_dim_size = height; + } + + const int min_num_samples = 16; // Minimum allowed number of samples for split block + const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; + partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); + + // Minimum width for ISP splits is 4. (JVET-T2001 chapter 8.4.5.1 equation 246: nPbW = Max(4, nW)) + // Except this does not apply for transform blocks for some reason. VTM does seem to expect 4 transform blocks even if only two pred blocks were used + // Height can be 2.
+ if (!divide_in_rows && !is_transform_split) { + partition_size = MAX(4, partition_size); + } + + assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) >= uvg_math_floor_log2(min_num_samples)) && + "Partition has less than allowed minimum number of samples."); + return partition_size; +} + + +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_split) +{ + assert((split_type != ISP_MODE_NO_ISP) && "This function cannot be called if ISP mode is 0."); + int split_dim = uvg_get_isp_split_dim(width, height, split_type, is_transform_split); + int num = split_type == ISP_MODE_HOR ? height / split_dim : width / split_dim; + + return num; +} + + +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_split) +{ + // Check for illegal splits + assert(!(block_w == 4 && block_h == 4) || split_idx == 0 && "Trying to get ISP split CU when split is not allowed."); + assert(!((block_w * block_h) <= 16) || split_idx < 2 && "Split index for small blocks must be in [0, 1]"); + assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); + assert((split_type != ISP_MODE_NO_ISP || split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); + int part_dim = block_w; + if (split_type != ISP_MODE_NO_ISP) { + part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type, is_transform_split); + } + if(split_type == ISP_MODE_VER && block_w < 16 && block_h != 4 && !is_transform_split) { + split_idx /= 2; + } + const int offset = part_dim * split_idx; + + const int part_x = split_type == ISP_MODE_HOR ? x : x + offset; + const int part_y = split_type == ISP_MODE_HOR ? y + offset : y; + const int part_w = split_type == ISP_MODE_HOR ? block_w : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? part_dim : block_h; + + uvg_cu_loc_ctor(loc, part_x, part_y, part_w, part_h); } static void intra_recon_tb_leaf( encoder_state_t* const state, - int x, - int y, - int depth, + const cu_loc_t* pu_loc, + const cu_loc_t* cu_loc, lcu_t *lcu, color_t color, - const intra_search_data_t* search_data, - enum uvg_tree_type tree_type) + const intra_search_data_t* search_data) { const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; - int log2width = LOG2_LCU_WIDTH - depth; - if (color != COLOR_Y && depth < MAX_PU_DEPTH) { - // Chroma width is half of luma width, when not at maximum depth. - log2width -= 1; - } - const int width = 1 << log2width; - const int height = width; // TODO: proper height for non-square blocks + const int x = pu_loc->x; + const int y = pu_loc->y; + + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int lcu_width = LCU_WIDTH >> shift; const vector2d_t luma_px = { x, y }; @@ -1480,8 +1778,10 @@ static void intra_recon_tb_leaf( int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0; + uint8_t isp_mode = color == COLOR_Y ? search_data->pred_cu.intra.isp_mode : 0; uvg_intra_references refs; + // Extra reference lines for use with MRL. Extra lines needed only for left edge. 
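To make the arithmetic of uvg_get_isp_split_dim() and uvg_get_isp_split_num() above concrete, here is a standalone sketch of the prediction-block case (floor_log2 and isp_split_dim are illustrative names; the 4-wide clamp applies only to vertical prediction splits, as in the patch):

#include <stdio.h>

static int floor_log2(int v) { int r = -1; while (v > 0) { v >>= 1; ++r; } return r; }

/* Mirrors uvg_get_isp_split_dim() for prediction blocks.
 * hor_split != 0 splits the block into rows. */
static int isp_split_dim(int w, int h, int hor_split)
{
  const int split_dim = hor_split ? h : w;
  const int other_dim = hor_split ? w : h;
  /* Every partition must keep at least 16 samples. */
  const int factor = other_dim < 16 ? 16 >> floor_log2(other_dim) : 1;
  int part = split_dim >> 2;            /* Aim for four partitions. */
  if (part < factor) part = factor;
  if (!hor_split && part < 4) part = 4; /* nPbW = Max(4, nW). */
  return part;
}

int main(void)
{
  /* 8x8, horizontal ISP: partition height 2, i.e. four 8x2 partitions. */
  printf("%d\n", isp_split_dim(8, 8, 1)); /* 2 */
  /* 4x8, horizontal ISP: the 16-sample floor forces height 4, so only
   * 8/4 = 2 partitions -- "small blocks are split only twice". */
  printf("%d\n", isp_split_dim(4, 8, 1)); /* 4 */
  return 0;
}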
uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; @@ -1490,26 +1790,20 @@ static void intra_recon_tb_leaf( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; - height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. - height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); + int ref_height = height * 2 + MAX_REF_LINE_IDX; + ref_height = MIN(ref_height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. + ref_height = MIN(ref_height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], &extra_refs[i * 128], - 1, height, + 1, ref_height, frame->rec->stride, 1); } } - uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + + uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; - - cu_loc_t loc = { - x, y, - width, height, - width, height, - }; - - uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; @@ -1529,12 +1823,13 @@ static void intra_recon_tb_leaf( default: break; } - uvg_pixels_blit(pred, block , width, width, width, lcu_width); + uvg_pixels_blit(pred, block , width, height, width, lcu_width); if(color != COLOR_Y && cfg->jccr) { - uvg_pixels_blit(pred, block2, width, width, width, lcu_width); + uvg_pixels_blit(pred, block2, width, height, width, lcu_width); } } + /** * \brief Reconstruct an intra CU * @@ -1552,79 +1847,219 @@ static void intra_recon_tb_leaf( */ void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, bool recon_luma, bool recon_chroma) { - const vector2d_t lcu_px = { SUB_SCU(x) >> (tree_type == UVG_CHROMA_T), SUB_SCU(y) >> (tree_type == UVG_CHROMA_T) }; - const int8_t width = LCU_WIDTH >> depth; + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + const vector2d_t lcu_px = { + cu_loc->local_x, + cu_loc->local_y, + }; + const int8_t width = cu_loc->width; + const int8_t height = cu_loc->height; if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - if(!recon_luma && recon_chroma) { - x &= ~7; - y &= ~7; - } - + // Reset CBFs because CBFs might have been set // for depth earlier if (recon_luma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_Y); + cbf_clear(&cur_cu->cbf, COLOR_Y); } if (recon_chroma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); } - if (depth == 0 || cur_cu->tr_depth > depth) { - - const int offset = width / 2; - const int32_t x2 = x + offset; - const int32_t y2 = y + offset; - - uvg_intra_recon_cu(state, x, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - 
uvg_intra_recon_cu(state, x, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - - // Propagate coded block flags from child CUs to parent CU. - uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), lcu_px.y >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, - }; - - if (recon_luma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); + if (width > TR_MAX_WIDTH || height > TR_MAX_WIDTH) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } - if (recon_chroma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; } - } else { - const bool has_luma = recon_luma; - const bool has_chroma = recon_chroma && (x % 8 == 0 && y % 8 == 0); - - // Process a leaf TU. - if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data, tree_type); - } - if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data, tree_type); + else { + split = BT_HOR_SPLIT; } - uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), - search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, - x, y, depth, cur_cu, lcu, - false, - tree_type); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + uvg_intra_recon_cu( + state, search_data, &split_cu_loc[i], + NULL, lcu, + state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? tree_type : UVG_BOTH_T, + recon_luma, recon_chroma); + } + + return; } + if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { + search_data->best_isp_cbfs = 0; + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. 
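A note on the control flow above: the removed depth-based quad-tree recursion is replaced by an implicit split that fires only when the CU is wider or taller than the maximum transform size, choosing the narrowest split that restores legal transform dimensions. A sketch of that selection (enum and function names are illustrative; max_tr stands in for TR_MAX_WIDTH):

/* Which implicit split uvg_intra_recon_cu() applies to an oversized CU. */
enum implicit_split { SPLIT_QT, SPLIT_BT_VER, SPLIT_BT_HOR, SPLIT_NONE };

static enum implicit_split implicit_tr_split(int w, int h, int max_tr)
{
  if (w > max_tr && h > max_tr) return SPLIT_QT;     /* Oversized both ways: quarter it. */
  if (w > max_tr)               return SPLIT_BT_VER; /* Too wide: halve the width. */
  if (h > max_tr)               return SPLIT_BT_HOR; /* Too tall: halve the height. */
  return SPLIT_NONE;                                 /* Fits: reconstruct directly. */
}

For example, with a 32-sample maximum transform, a 64x32 CU takes SPLIT_BT_VER into two 32x32 halves, each of which then fits a single transform.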
+ int split_type = search_data->pred_cu.intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + state->quant_blocks[1].needs_init = true; + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); + cur_cu->intra.isp_index = 0; + if(tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + state->rate_estimator[3].needs_init = true; + uvg_quantize_lcu_residual(state, true, false, false, + &tu_loc, cur_cu, lcu, + false, tree_type); + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, COLOR_Y) << i; + cur_cu->intra.isp_cbfs = search_data->best_isp_cbfs; + } + } + const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; + const bool has_chroma = recon_chroma; + + // Process a leaf TU. + if (has_luma) { + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + if (has_chroma) { + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_U, search_data); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_V, search_data); + } + + // TODO: not necessary to call if only luma and ISP is on + uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), + search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, + cu_loc, cur_cu, lcu, + false, + tree_type); } + + +/** +* \brief Check if ISP can be used for block size. +* +* \return True if isp can be used. +* \param width Block width. +* \param height Block height. +*/ +bool uvg_can_use_isp(const int width, const int height) +{ + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); + assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); + + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // Each split block must have at least 16 samples. + bool not_enough_samples = (log2_width + log2_height <= 4); + bool cu_size_larger_than_max_tr_size = width > TR_MAX_WIDTH || height > TR_MAX_WIDTH; + if (not_enough_samples || cu_size_larger_than_max_tr_size) { + return false; + } + return true; +} + + +/** +* \brief Check if given ISP mode can be used with LFNST. +* +* \return True if the given ISP mode can be used with LFNST. +* \param width Block width. +* \param height Block height. +* \param isp_split_type ISP split type. +* \param tree_type Tree type. Dual, luma or chroma tree. +*/ +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) +{ + if (tree_type == UVG_CHROMA_T) { + return false; + } + if (isp_split_type == ISP_MODE_NO_ISP) { + return true; + } + + const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER, true); + const int tu_height = (isp_split_type == ISP_MODE_HOR) ?
uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR, true) : height; + + if (!(tu_width >= TR_MIN_WIDTH && tu_height >= TR_MIN_WIDTH)) + { + return false; + } + return true; +} + + +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst) { + assert(state->search_cabac.update && "ISP reconstruction must be done with CABAC update"); + double cost = 0; + + const int width = cu_loc->width; + const int height = cu_loc->height; + + search_data->best_isp_cbfs = 0; + search_data->pred_cu.intra.isp_cbfs = 0; + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = search_data->pred_cu.intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + int cbf_context = 2; + state->quant_blocks[1].needs_init = true; + + for (int i = 0; i < split_limit; ++i) { + search_data->pred_cu.intra.isp_index = i; + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); + if (tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + + state->rate_estimator[3].needs_init = true; + uvg_quantize_lcu_residual(state, true, false, false, + &tu_loc, &search_data->pred_cu, lcu, + false, UVG_LUMA_T); + + int index = tu_loc.local_y * LCU_WIDTH + tu_loc.local_x; + int ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + tu_loc.width, tu_loc.height); + double coeff_bits = uvg_get_coeff_cost(state, lcu->coeff.y, &search_data->pred_cu, &tu_loc, 0, SCAN_DIAG, false, COEFF_ORDER_CU); + + + int cbf = cbf_is_set(search_data->pred_cu.cbf, COLOR_Y); + if (i + 1 != split_limit || search_data->best_isp_cbfs != 0) { + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_luma[cbf_context], cbf, coeff_bits, "cbf_luma_isp_recon"); + } + cost += ssd + coeff_bits * state->lambda; + + cbf_context = 2 + cbf; + if(violates_lfnst) *violates_lfnst |= search_data->pred_cu.violates_lfnst_constrained_luma; + search_data->pred_cu.violates_lfnst_constrained_luma = false; + + search_data->best_isp_cbfs |= cbf << i; + search_data->pred_cu.intra.isp_cbfs = search_data->best_isp_cbfs; + + } + search_data->pred_cu.intra.isp_index = 0; + return cost; +} \ No newline at end of file diff --git a/src/intra.h b/src/intra.h index a2ffa230..c15b182a 100644 --- a/src/intra.h +++ b/src/intra.h @@ -71,6 +71,7 @@ typedef struct { double coeff_bits; double distortion; double lfnst_costs[3]; + uint8_t best_isp_cbfs; } intra_search_data_t ; @@ -107,7 +108,9 @@ int8_t uvg_intra_get_dir_luma_predictor( * \param multi_ref_idx Multi reference line index for the prediction block. */ void uvg_intra_build_reference( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -115,7 +118,8 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_refs, - uint8_t multi_ref_idx); + uint8_t multi_ref_idx, + const uint8_t isp_mode); /** * \brief Generate intra predictions. 
@@ -130,32 +134,60 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ); + const lcu_t* lcu +); void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, bool recon_luma, bool recon_chroma); -const cu_info_t* uvg_get_co_located_luma_cu( - int x, - int y, - int width, - int height, +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst); + +int8_t uvg_get_co_located_luma_mode( + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type); +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t* const luma_loc, cu_info_t const* const cur_cu, enum + uvg_tree_type tree_type); -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a); + +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const int log2_width, + const int log2_height, + const bool account_for_dc_planar); + +// ISP related defines +#define NUM_ISP_MODES 3 +#define ISP_MODE_NO_ISP 0 +#define ISP_MODE_HOR 1 +#define ISP_MODE_VER 2 +#define SPLIT_TYPE_HOR 1 +#define SPLIT_TYPE_VER 2 + +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block); +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block); +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block); +bool uvg_can_use_isp(const int width, const int height); +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/rate_control.c b/src/rate_control.c index 67570565..3dfa35fe 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -795,12 +795,20 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) state->frame->QP + 2 + frame_allocation, est_qp); } + if(state->encoder_control->cfg.dep_quant) { + est_lambda *= pow(2, 0.25 / 3.0); + } state->lambda = est_lambda; state->lambda_sqrt = sqrt(est_lambda); state->qp = est_qp; int8_t chroma_qp = encoder->qp_map[0][est_qp]; double tmpWeight = pow(2.0, (est_qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? 
pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = est_lambda / tmpWeight; ctu->qp = est_qp; ctu->lambda = est_lambda; @@ -820,7 +828,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); ctu->adjust_lambda = state->lambda; @@ -1103,7 +1115,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, pos.x = 0; } state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { @@ -1138,6 +1155,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, state->frame->lambda * 1.5874010519681994, lambda); lambda = clip_lambda(lambda); + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } state->lambda = lambda; state->lambda_sqrt = sqrt(lambda); @@ -1145,8 +1165,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, } else { state->qp = state->frame->QP; - state->lambda = state->frame->lambda; - state->lambda_sqrt = sqrt(state->frame->lambda); + double lambda = state->frame->lambda; + + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = lambda; + state->lambda_sqrt = sqrt(lambda); } lcu->lambda = state->lambda; @@ -1154,6 +1179,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, int8_t chroma_qp = ctrl->qp_map[0][state->qp]; double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? 
pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = state->lambda / tmpWeight; // Apply variance adaptive quantization @@ -1170,10 +1200,34 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); lcu->adjust_lambda = state->lambda; lcu->adjust_qp = state->qp; } } + + +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode) +{ + const encoder_control_t * const ctrl = state->encoder_control; + double lambda = state->lambda; + int8_t chroma_qp = ctrl->qp_map[0][state->qp]; + double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + lambda /= tmpWeight; + lambda *= use_jccr && state->qp > 18 ? 1.3 : 1.0; + if (jccr_mode == 1 || jccr_mode == 2) { + lambda *= 0.8; + } else if (jccr_mode == 3) { + lambda *= 0.5; + } + return lambda; +} \ No newline at end of file diff --git a/src/rate_control.h b/src/rate_control.h index f397e2a2..644d7fc4 100644 --- a/src/rate_control.h +++ b/src/rate_control.h @@ -76,4 +76,6 @@ void uvg_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos); void uvg_update_after_picture(encoder_state_t * const state); void uvg_estimate_pic_lambda(encoder_state_t * const state); +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode); + #endif // RATE_CONTROL_H_ diff --git a/src/rdo.c b/src/rdo.c index f8ebacdf..c5d1c71b 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -33,6 +33,7 @@ #include "rdo.h" #include +#include #include #include #include @@ -52,7 +53,6 @@ #include "strategies/strategies-quant.h" -#define QUANT_SHIFT 14 #define SCAN_SET_SIZE 16 #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 @@ -297,15 +297,20 @@ out: static INLINE double get_coeff_cabac_cost( const encoder_state_t * const state, const coeff_t *coeff, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, cu_info_t* cur_tu) { + const int width = cu_loc->width; + const int height = cu_loc->height; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + // Make sure there are coeffs present bool found = false; - for (int i = 0; i < width*width; i++) { + for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) { if (coeff[i] != 0) { found = 1; break; @@ -330,7 +335,7 @@ static INLINE double get_coeff_cabac_cost( uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, coeff, - width, + cu_loc, color, scan_mode, cur_tu, @@ -341,6 +346,7 @@ static INLINE double get_coeff_cabac_cost( &cabac_copy, coeff, width, + height, color, scan_mode, &bits); @@ -391,14 +397,36 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip) + int8_t tr_skip, + int coeff_order) { uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on; uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int x_local = cu_loc->x % LCU_WIDTH; + int y_local = cu_loc->y % LCU_WIDTH; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int lcu_width = color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C; + + + const coeff_t* coeff_ptr = NULL; + coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + if (coeff_order == COEFF_ORDER_LINEAR) { + coeff_ptr = coeff; + } + else { + // Coeff order CU + uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + coeff_ptr = sub_coeff; + } + if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit && state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) { // TODO: do we need to assert(0) out of the fast-estimation branch if we @@ -409,17 +437,17 @@ double uvg_get_coeff_cost( return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0) } else { uint64_t weights = uvg_fast_coeff_get_weights(state); - uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights); + uint32_t fast_cost = uvg_fast_coeff_cost(coeff_ptr, width, height, weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { - save_ccc(state->qp, coeff, width * width, ccc); + save_ccc(state->qp, coeff, width * height, ccc); } return ccc; } @@ -677,19 +705,20 @@ static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t * tables generated during RDOQ to select the best coefficient to change. 
*/ void uvg_rdoq_sign_hiding( - const encoder_state_t *const state, - const int32_t qp_scaled, - const uint32_t *const scan2raster, - const struct sh_rates_t *const sh_rates, - const int32_t last_pos, - const coeff_t *const coeffs, - coeff_t *const quant_coeffs, - const int8_t color) + const encoder_state_t *const state, + const int32_t qp_scaled, + const uint32_t *const scan2raster, + const struct sh_rates_t *const sh_rates, + const int32_t last_pos, + const coeff_t *const coeffs, + coeff_t *const quant_coeffs, + const int8_t color, + const bool need_sqrt_adjust) { const encoder_control_t * const ctrl = state->encoder_control; const double lambda = color ? state->c_lambda : state->lambda; - int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6]; + int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6]; // This somehow scales quant_delta into fractional bits. Instead of the bits // being multiplied by lambda, the residual is divided by it, or something // like that. @@ -814,28 +843,28 @@ void uvg_rdoq_sign_hiding( } } -static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height) +static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height, uint8_t mts_index) { const coeff_t* pData = coeff + posX + posY * width; coeff_t sum = 0; if (posX < width - 1) { - sum += abs(pData[1]); + sum += mts_index && posX + 1 >= 16 ? 0 : abs(pData[1]); if (posX < width - 2) { - sum += abs(pData[2]); + sum += mts_index && posX + 2 >= 16 ? 0 : abs(pData[2]); } if (posY < height - 1) { - sum += abs(pData[width + 1]); + sum += mts_index && (posY + 1 >= 16 || posX + 1 >= 16) ? 0 : abs(pData[width + 1]); } } if (posY < height - 1) { - sum += abs(pData[width]); + sum += mts_index && posY + 1 >= 16 ? 0 : abs(pData[width]); if (posY < height - 2) { - sum += abs(pData[width << 1]); + sum += mts_index && posY + 2 >= 16 ? 
0 : abs(pData[width << 1]); } } return MAX(MIN(sum - 5 * baseLevel, 31), 0); @@ -1141,7 +1170,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const int max_log2_tr_dynamic_range = 15; uint32_t log2_tr_width = uvg_math_floor_log2(width); uint32_t log2_tr_height = uvg_math_floor_log2(height); - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0]; const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1]; @@ -1166,15 +1196,18 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); FILL_ARRAY(cost_coeffgroup_sig, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); FILL_ARRAY(cost_coeffgroup_sig, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); FILL_ARRAY(cost_coeffgroup_sig, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); FILL_ARRAY(cost_coeffgroup_sig, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); FILL_ARRAY(cost_coeffgroup_sig, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); FILL_ARRAY(cost_coeffgroup_sig, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); FILL_ARRAY(cost_coeffgroup_sig, 0, 64); break; default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); } const bool needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation. const int q_bits = QUANT_SHIFT + qp_scaled / 6 + (needs_sqrt2_scale ? -1 : 0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits - const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6]; + const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6]; const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff; @@ -1182,8 +1215,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1221,8 +1254,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ scan_pos = (sbId << log2_cg_size) + scan_pos_in_sb; int last_pos_coded = sbSizeM1; uint32_t blkpos = scan[scan_pos]; - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== quantization ===== // set coeff @@ -1365,6 +1398,48 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ return abs_sum; } + +static uint32_t context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, + uint32_t width, uint32_t height, int8_t color, + int32_t* temp_diag, int32_t* temp_sum, int8_t mts) +{ + const coeff_t* data = 
coeff + pos_x + pos_y * width; + const int diag = pos_x + pos_y; + int num_pos = 0; + int sum_abs = 0; +#define UPDATE(x) {int a=abs(x);sum_abs+=MIN(4+(a&1),a);num_pos+=(a?1:0);} + if (pos_x < width - 1) + { + UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[1]); + if (pos_x < width - 2) + { + UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[2]); + } + if (pos_y < height - 1) + { + UPDATE(mts && (pos_y + 1 >= 16 || pos_x + 1 >= 16) ? 0 : data[width + 1]); + } + } + if (pos_y < height - 1) + { + UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[width]); + if (pos_y < height - 2) + { + UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[width << 1]); + } + } +#undef UPDATE + int ctx_ofs = MIN((sum_abs + 1) >> 1, 3) + (diag < 2 ? 4 : 0); + if (color == COLOR_Y) + { + ctx_ofs += diag < 5 ? 4 : 0; + } + + *temp_diag = diag; + *temp_sum = sum_abs - num_pos; + return ctx_ofs; +} + /** RDOQ with CABAC * \returns void * Rate distortion optimized quantization for entropy @@ -1377,31 +1452,35 @@ void uvg_rdoq( coeff_t *dest_coeff, int32_t width, int32_t height, - int8_t type, + int8_t color, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, - uint8_t lfnst_idx) + uint8_t lfnst_idx, uint8_t mts_idx) { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; - uint32_t log2_tr_width = uvg_math_floor_log2( height ); - uint32_t log2_tr_height = uvg_math_floor_log2( width ); - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; - const uint32_t log2_block_size = uvg_g_convert_to_bit[ width ] + 2; - int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + type; - - int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); - int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; + int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + color; - const double lambda = type ? state->c_lambda : state->lambda; + int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + + int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift - needs_block_size_trafo_scale; - const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; - const double *err_scale = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; + const double lambda = color ? 
state->c_lambda : state->lambda; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; + + const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; + const double *err_scale = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; double block_uncoded_cost = 0; @@ -1415,14 +1494,19 @@ void uvg_rdoq( memset(dest_coeff, 0, sizeof(coeff_t) * width * height); - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); + + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; - const uint32_t num_blk_side = width >> shift; + const uint32_t num_blk_side = MAX(width >> shift, 1); double cost_coeffgroup_sig[ 64 ]; uint32_t sig_coeffgroup_flag[ 64 ]; @@ -1431,26 +1515,34 @@ void uvg_rdoq( int32_t temp_diag = -1; int32_t temp_sum = -1; - const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; - int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; - uint32_t cg_num = width * height >> 4; + uint32_t cg_num = lfnst_idx > 0 ? 1 : width * height >> 4; + + double dTransShift = (double)transform_shift + (needs_block_size_trafo_scale ? -0.5 : 0.0); + // Compensate for scaling of bitcount in Lagrange cost function + double scale = CTX_FRAC_ONE_BIT; + // Compensate for scaling through forward transform + scale = scale * pow(2.0, -2.0 * dTransShift); + const double default_error_scale = scale / default_quant_coeff / default_quant_coeff; // Explicitly tell the only possible numbers of elements to be zeroed. // Hope the compiler is able to utilize this information. switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break; - default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); + default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups"); } - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[type ? 2 : 0]); - cabac_ctx_t *baseCtx = (type == 0) ? 
&(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); - cabac_ctx_t* base_gt1_ctx = (type == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]); + cabac_ctx_t *baseCtx = (color == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); + cabac_ctx_t* base_gt1_ctx = (color == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); struct { double coded_level_and_dist; @@ -1462,22 +1554,27 @@ void uvg_rdoq( //Find last cg and last scanpos const int max_lfnst_pos = ((height == 4 && width == 4) || (height == 8 && width == 8)) ? 7 : 15; - int32_t cg_scanpos; + int32_t cg_scanpos; + uint32_t max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1; for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) { - for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) + uint32_t cg_blkpos = scan_cg[cg_scanpos]; + uint32_t cg_pos_y = cg_blkpos / num_blk_side; + uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; - if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; + uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; + int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale); dest_coeff[blkpos] = max_abs_level; if (max_abs_level > 0) { @@ -1507,43 +1604,45 @@ void uvg_rdoq( uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); FILL(rd_stats, 0); - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) { continue; } uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; - double temp = err_scale[blkpos]; + int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; + double temp = (use_scaling_list ? err_scale[blkpos] : default_error_scale); int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; dest_coeff[blkpos] = max_abs_level; double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? 
err_scale[blkpos] : default_error_scale); block_uncoded_cost += cost_coeff0[ scanpos ]; if (last_scanpos >= 0) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== coefficient level estimation ===== int32_t level; uint16_t ctx_sig = 0; if (scanpos != last_scanpos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, type, &temp_diag, &temp_sum); + // VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here + ctx_sig = context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum, mts_idx); } if (temp_diag != -1) { - ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((type == 0) ? 15 : 5) : (type == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); + ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((color == 0) ? 15 : 5) : (color == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); } else ctx_set = 0; if (reg_bins < 4) { - int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height,mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } @@ -1554,12 +1653,12 @@ void uvg_rdoq( if (scanpos == last_scanpos) { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, 0, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 1, type); + reg_bins, q_bits, temp, 1, color); } else { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, ctx_sig, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 0, type); + reg_bins, q_bits, temp, 0, color); if (encoder->cfg.signhide_enable) { int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1); int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0); @@ -1572,14 +1671,14 @@ void uvg_rdoq( if (encoder->cfg.signhide_enable) { sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8); if (level > 0) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; - sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; + sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { // level == 0 if (reg_bins < 4) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { 
sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_gt1_ctx[gt1_ctx], 0); @@ -1595,7 +1694,7 @@ void uvg_rdoq( } else if (reg_bins >= 4) { reg_bins -= (level < 2 ? level : 3) + (scanpos != last_scanpos); - int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height, mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } } @@ -1620,7 +1719,7 @@ void uvg_rdoq( if( cg_scanpos ) { if (sig_coeffgroup_flag[cg_blkpos] == 0) { uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost; } else { @@ -1636,7 +1735,7 @@ void uvg_rdoq( // add SigCoeffGroupFlag cost to total cost ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1); base_cost += cost_coeffgroup_sig[cg_scanpos]; @@ -1656,7 +1755,7 @@ void uvg_rdoq( cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0); // reset coeffs to 0 in this block - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; uint32_t blkpos = scan[scanpos]; if (dest_coeff[blkpos]){ @@ -1679,12 +1778,12 @@ void uvg_rdoq( int8_t found_last = 0; int32_t best_last_idx_p1 = 0; - if( block_type != CU_INTRA && !type ) { + if( block_type != CU_INTRA && !color ) { best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0); base_cost += lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1); } else { cabac_ctx_t* base_cbf_model = NULL; - switch (type) { + switch (color) { case COLOR_Y: base_cbf_model = cabac->ctx.qt_cbf_model_luma; break; @@ -1697,25 +1796,26 @@ void uvg_rdoq( default: assert(0); } - ctx_cbf = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); + // This cbf should work even with non-square blocks + ctx_cbf = ( color != COLOR_V ? 
0 : cbf_is_set(cbf, COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); } - calc_last_bits(state, width, height, type, last_x_bits, last_y_bits); + calc_last_bits(state, width, height, color, last_x_bits, last_y_bits); for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { uint32_t cg_blkpos = scan_cg[cg_scanpos]; base_cost -= cost_coeffgroup_sig[cg_scanpos]; if (sig_coeffgroup_flag[ cg_blkpos ]) { - for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for ( int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) continue; uint32_t blkpos = scan[scanpos]; if( dest_coeff[ blkpos ] ) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - ( pos_y << log2_block_width ); double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; @@ -1739,11 +1839,23 @@ void uvg_rdoq( } // end for uint32_t abs_sum = 0; - for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { - int32_t blkPos = scan[scanpos]; - int32_t level = dest_coeff[blkPos]; - abs_sum += level; - dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + if(!mts_idx || (width < 32 && height < 32)) { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t level = dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + } + } + else { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t blk_x = blkPos & (width - 1); + int32_t blk_y = blkPos >> log2_block_width; + int32_t level = blk_x >= 16 || blk_y >= 16 ? 0 : dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( level < 0 ) ? 
-level : level); + } } //===== clean uncoded coefficients ===== for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) { @@ -1751,7 +1863,7 @@ void uvg_rdoq( } if (encoder->cfg.signhide_enable && abs_sum >= 2) { - uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, type); + uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale); } } diff --git a/src/rdo.h b/src/rdo.h index 7f325cfd..2ba0c2a9 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -44,6 +44,8 @@ #include "global.h" // IWYU pragma: keep #include "search_inter.h" +#define QUANT_SHIFT 14 +#define IQUANT_SHIFT 6 extern const uint32_t uvg_g_go_rice_range[5]; extern const uint32_t uvg_g_go_rice_prefix_len[5]; @@ -60,9 +62,8 @@ void uvg_rdoq( int8_t type, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, - uint8_t lfnst_idx); + uint8_t lfnst_idx, uint8_t mts_idx); int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_coeff, int32_t width, @@ -73,10 +74,11 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip); + int8_t tr_skip, + int coeff_order); int32_t uvg_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_gt1, uint16_t ctx_num_gt2, uint16_t ctx_num_par, uint16_t abs_go_rice, uint32_t reg_bins, int8_t type, int use_limited_prefix_length); diff --git a/src/scalinglist.c b/src/scalinglist.c index 5c32ac4c..01edfa27 100644 --- a/src/scalinglist.c +++ b/src/scalinglist.c @@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] = 24, 25, 28, 33, 41, 54, 71, 91 }; -const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564}; -const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72}; +const int16_t uvg_g_quant_scales[2][6] = { + {26214, 23302, 20560, 18396, 16384, 14564}, + { 18396,16384,14564,13107,11651,10280 } +}; +const int16_t uvg_g_inv_quant_scales[2][6] = { + {40, 45, 51, 57, 64, 72}, + { 57,64,72,80,90,102 } +}; /** @@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp]; int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp]; - // Encoder list - uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio, + // Encoder list TODO: the sqrt adjusted lists + uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); // Decoder list - scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio, + scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); diff --git a/src/search.c b/src/search.c index cb9fc1d1..c353a914 100644 --- a/src/search.c +++ b/src/search.c @@ -36,11 +36,14 @@ #include #include "cabac.h" +#include "cu.h" #include "encoder.h" #include "encode_coding_tree.h" +#include "filter.h" #include "imagelist.h" #include "inter.h" #include "intra.h" +#include "rate_control.h" #include "uvg266.h" #include "rdo.h" #include "search_inter.h" @@ -62,92 +65,247 @@ static const int 
INTRA_THRESHOLD = 8; -static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) +static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type + tree_type) { - for (int y = y_local; y < y_local + width; y += SCU_WIDTH) { - for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { + const int y_limit = (cu_loc->local_y + cu_loc->height); + const int x_limit = (cu_loc->local_x + cu_loc->width); + for (int y = cu_loc->local_y ; y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); } } } -static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, enum uvg_tree_type - tree_type) + +static INLINE void initialize_partial_work_tree( + const encoder_state_t* const state, + lcu_t* from, + lcu_t *to, + const cu_loc_t * const cu_loc, + const cu_loc_t* const + chroma_loc, + const enum uvg_tree_type tree_type) { + + const int y_limit = MIN(LCU_WIDTH, state->tile->frame->height - cu_loc->y / 64 * 64); + const int x_limit = MIN(LCU_WIDTH, state->tile->frame->width - cu_loc->x / 64 * 64); + + if (cu_loc->local_x == 0) { + to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + else { + if(tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(from->rec.y, to->rec.y, cu_loc->local_x, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + } + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(from->rec.u, to->rec.u, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(from->rec.v, to->rec.v, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + } + } + + if (cu_loc->local_y == 0) { + to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + else { + if (tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(&from->rec.y[cu_loc->local_x], &to->rec.y[cu_loc->local_x], + LCU_WIDTH - cu_loc->local_x, cu_loc->local_y, + LCU_WIDTH, LCU_WIDTH); + } + if (tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(&from->rec.u[chroma_loc->local_x / 2], &to->rec.u[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->rec.v[chroma_loc->local_x / 2], &to->rec.v[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + } + } + + if (tree_type == UVG_CHROMA_T) { + // These are needed for CCLM + uvg_pixels_blit(from->rec.y, to->rec.y, MIN(cu_loc->local_x + cu_loc->width * 2, LCU_WIDTH), MIN(cu_loc->local_y + cu_loc->height * 2, LCU_WIDTH), LCU_WIDTH, LCU_WIDTH); + } + + to->ref.chroma_format = from->ref.chroma_format; + to->rec.chroma_format = from->rec.chroma_format; + + if (tree_type != UVG_CHROMA_T) { + const int offset = cu_loc->local_x + cu_loc->local_y * LCU_WIDTH; + uvg_pixels_blit(&from->ref.y[offset], &to->ref.y[offset], cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); + } + + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + const int offset = chroma_loc->local_x / 2 + chroma_loc->local_y / 2 * LCU_WIDTH_C; + uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, 
LCU_WIDTH_C); + } + if((chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x) && tree_type == UVG_BOTH_T) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + } + + const int y_start = (cu_loc->local_y) - 4; + const int x_start = (cu_loc->local_x) - 4; + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + } + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } + + for (int y = cu_loc->local_y; y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + if((chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x) && tree_type == UVG_BOTH_T) { + const int y_start = (chroma_loc->local_y) - 4; + const int x_start = (chroma_loc->local_x) - 4; + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + } + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } + + for(int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + if(x >= cu_loc->local_x && y >= cu_loc->local_y) continue; + *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); + } + } + + if (chroma_loc->local_x == 0) { + to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + if (chroma_loc->local_y == 0) { + to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + if (x_limit != LCU_WIDTH) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } + } + } + else { + if (x_limit != LCU_WIDTH) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } + } + } +} + +static INLINE void copy_cu_pixels( + lcu_t *from, + lcu_t *to, + const cu_loc_t* const cu_loc, + enum uvg_tree_type + tree_type) { + const int x_local = cu_loc->local_x; + const int y_local = cu_loc->local_y; const int luma_index = x_local + y_local * LCU_WIDTH; - const int chroma_index = tree_type == UVG_CHROMA_T ?
x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; + const int chroma_index = (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; if(tree_type != UVG_CHROMA_T) { uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index], - width, width, LCU_WIDTH, LCU_WIDTH); + cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { uvg_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } -static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum +// ISP_TODO: this needs to work with the new CU coefficient order +static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum uvg_tree_type tree_type) { if (tree_type != UVG_CHROMA_T) { - const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local); - copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width); + //const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); + const int idx = (cu_loc->x % LCU_WIDTH) + ((cu_loc->y % LCU_WIDTH) * LCU_WIDTH); + copy_coeffs(&from->coeff.y[idx], &to->coeff.y[idx], cu_loc->width, cu_loc->height, LCU_WIDTH); + } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T)); - copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1); - copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1); + //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); + const int chroma_x = (cu_loc->x >> 1); + const int chroma_y = (cu_loc->y >> 1); + + const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C); + copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + copy_coeffs(&from->coeff.v[idx], &to->coeff.v[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (joint) { - copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1); + copy_coeffs(&from->coeff.joint_uv[idx], &to->coeff.joint_uv[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); } } } + +static void lcu_fill_chroma_cu_info(lcu_t* lcu, const cu_loc_t* const cu_loc); /** * Copy all non-reference CU data from next level to current level.
*/ -static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint, enum - uvg_tree_type tree_type) +static void work_tree_copy_up( + lcu_t *from, + lcu_t* to, + bool joint, + enum + uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc) { - const int width = LCU_WIDTH >> depth; - copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]); - copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type); - copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); + copy_cu_info (from, to, cu_loc, tree_type); + copy_cu_pixels(from, to, cu_loc, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); + copy_cu_coeffs(cu_loc, from, to, joint, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); + if (chroma_loc && tree_type != UVG_LUMA_T) { + copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); + copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); + + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += 4) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += 4) { + cu_info_t* to_cu = LCU_GET_CU_AT_PX(to, x, y); + cu_info_t* from_cu = LCU_GET_CU_AT_PX(from, x, y); + to_cu->intra.mode_chroma = from_cu->intra.mode_chroma; + to_cu->joint_cb_cr = from_cu->joint_cb_cr; + to_cu->cr_lfnst_idx = from_cu->cr_lfnst_idx; + to_cu->chroma_deblocking = from_cu->chroma_deblocking; + to_cu->log2_chroma_width = from_cu->log2_chroma_width; + to_cu->log2_chroma_height = from_cu->log2_chroma_height; + + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_U); + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_V); + } + } + } } -/** - * Copy all non-reference CU data from current level to all lower levels. - */ -static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree, enum uvg_tree_type - tree_type) -{ - const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> 1; - for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) { - copy_cu_info (x_local, y_local, width, &work_tree[depth], &work_tree[i]); - copy_cu_pixels(x_local, y_local, LCU_WIDTH >> depth, &work_tree[depth], &work_tree[i], tree_type); - } -} - -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type) -{ - const int x_local = SUB_SCU(x_px); - const int y_local = SUB_SCU(y_px); - const unsigned width = (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C) >> depth; - - for (unsigned y = 0; y < width; y += SCU_WIDTH) { - for (unsigned x = 0; x < width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth; - } - } -} - static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, int height, const cu_info_t *cu) { // Set mode in every CU covered by part_mode in this depth. 
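/* A standalone sketch (not the uvg266 API) of the SCU-grid fill pattern the
 * loop below implements: every 4x4 sub-unit covered by the CU receives a copy
 * of the CU-level parameters, so a later lookup at any covered pixel finds the
 * same info. Struct fields and names here are hypothetical. */
#define SCU 4              /* sub-unit granularity in pixels (SCU_WIDTH) */
#define GRID (64 / SCU)    /* SCUs per 64-pixel LCU row */

typedef struct { int qp; int mode; } scu_info;

static void fill_region(scu_info grid[GRID][GRID],
                        int x_px, int y_px, int w_px, int h_px, scu_info v)
{
  for (int y = y_px; y < y_px + h_px; y += SCU)
    for (int x = x_px; x < x_px + w_px; x += SCU)
      grid[y / SCU][x / SCU] = v;   /* replicate per covered SCU */
}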
@@ -155,21 +313,29 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y); to->type = cu->type; - to->depth = cu->depth; - to->part_size = cu->part_size; to->qp = cu->qp; + to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; + to->cr_lfnst_idx = cu->cr_lfnst_idx; + to->joint_cb_cr = cu->joint_cb_cr; to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos; to->violates_lfnst_constrained_luma = cu->violates_lfnst_constrained_luma; to->violates_lfnst_constrained_chroma = cu->violates_lfnst_constrained_chroma; + to->log2_height = cu->log2_height; + to->log2_width = cu->log2_width; + + to->log2_chroma_height = cu->log2_chroma_height; + to->log2_chroma_width = cu->log2_chroma_width; + if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; to->intra.mode_chroma = cu->intra.mode_chroma; to->intra.multi_ref_idx = cu->intra.multi_ref_idx; to->intra.mip_flag = cu->intra.mip_flag; to->intra.mip_is_transposed = cu->intra.mip_is_transposed; + to->intra.isp_mode = cu->intra.isp_mode; } else { to->skipped = cu->skipped; to->merged = cu->merged; @@ -180,74 +346,105 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } -static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type) +static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) { - const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; - const int num_pu = uvg_part_mode_num_parts[part_mode]; + // The bottom right cu will always have the chroma info + cu_info_t *bottom_right = LCU_GET_CU_AT_PX( + lcu, + cu_loc->local_x + cu_loc->width - 1, + cu_loc->local_y + cu_loc->height - 1); + if(bottom_right->type != CU_INTRA) return; - for (int i = 0; i < num_pu; ++i) { - const int x_pu = PU_GET_X(part_mode, cu_width, x_local, i); - const int y_pu = PU_GET_Y(part_mode, cu_width, y_local, i); - const int width_pu = PU_GET_W(part_mode, cu_width, i); - const int height_pu = PU_GET_H(part_mode, cu_width, i); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - pu->type = type; - lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu); + for(int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += 4 ) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += 4) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); + cu->intra.mode_chroma = bottom_right->intra.mode_chroma; + cu->joint_cb_cr = bottom_right->joint_cb_cr; + cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + cu->log2_chroma_height = bottom_right->log2_chroma_height; + cu->log2_chroma_width = bottom_right->log2_chroma_width; + cu->type = bottom_right->type; + cu->tr_skip |= bottom_right->tr_skip & 6; + } } } -static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) -{ - const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; - const uint32_t mask = ~((width >> tr_split)-1); +static void lcu_fill_chroma_cbfs(lcu_t *lcu, const cu_loc_t * const chroma_loc, enum uvg_tree_type tree_type) +{ + int8_t height = chroma_loc->height; + int8_t width = chroma_loc->width; + uint32_t x_local = chroma_loc->local_x; + uint32_t y_local = chroma_loc->local_y; + const int offset = ~((TR_MAX_WIDTH) - 1); // Set coeff flags in every CU covered by part_mode in this depth. 
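/* The cbf propagation below maps each SCU to the top-left SCU of its
 * transform block with a power-of-two mask, x & ~(TR_MAX_WIDTH - 1), which is
 * what the `offset` value above holds. A small self-contained demonstration
 * of that rounding, assuming TR_MAX_WIDTH == 64 as in VVC: */
#include <assert.h>

static unsigned tu_origin(unsigned coord)
{
  const unsigned tr_max_width = 64;       /* must be a power of two */
  return coord & ~(tr_max_width - 1);     /* clear low bits: round down */
}

static void tu_origin_demo(void)
{
  assert(tu_origin(0)   == 0);
  assert(tu_origin(60)  == 0);    /* still inside the first 64-wide TU */
  assert(tu_origin(64)  == 64);
  assert(tu_origin(100) == 64);   /* second TU's top-left column */
}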
- for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) { - for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { // Use TU top-left CU to propagate coeff flags - cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask); - cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t* cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & offset), y_local + (y & offset)); + cu_info_t* cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); if (cu_from != cu_to) { - // Chroma and luma coeff data is needed for deblocking - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); } } } + +} + +static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu, enum + uvg_tree_type tree_type) +{ + // Set coeff flags in every CU covered by part_mode in this depth. + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { + // Use TU top-left CU to propagate coeff flags + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & ~(TR_MAX_WIDTH - 1)), y_local + (y & ~(TR_MAX_WIDTH - 1))); + cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); + if (cu_from != cu_to) { + // Chroma and luma coeff data is needed for deblocking + if(tree_type != UVG_CHROMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); + if(tree_type != UVG_LUMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); + if (tree_type != UVG_LUMA_T)cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); + } + } + } } //Calculates cost for all zero coeffs -static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, +static double cu_zero_coeff_cost( + const encoder_state_t *state, + lcu_t *work_tree, + const cu_loc_t* const cu_loc, const int depth) { - int x_local = SUB_SCU(x); - int y_local = SUB_SCU(y); - int cu_width = LCU_WIDTH >> depth; lcu_t *const lcu = &work_tree[depth]; + const int y_local = cu_loc->local_y; + const int x_local = cu_loc->local_x; + const int luma_index = y_local * LCU_WIDTH + x_local; const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], - LCU_WIDTH, LCU_WIDTH, cu_width + LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height ); - if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { + if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); } // Save the pixels at a lower level of the working tree. 
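/* The zero-coefficient cost computed above, sketched standalone: with every
 * residual coefficient dropped there is no rate term, only distortion, an SSD
 * per plane weighted the way UVG_LUMA_MULT / UVG_CHROMA_MULT weight luma and
 * chroma in the encoder. The weight values below are illustrative only. */
static double plane_ssd(const unsigned char *a, const unsigned char *b,
                        int stride, int w, int h)
{
  double sum = 0.0;
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x) {
      const double d = (double)a[y * stride + x] - (double)b[y * stride + x];
      sum += d * d;
    }
  return sum;
}

static double zero_coeff_cost(const unsigned char *ref[3],
                              const unsigned char *rec[3],
                              int luma_stride, int chroma_stride,
                              int w, int h)
{
  const double luma_mult = 0.8, chroma_mult = 1.5;  /* illustrative */
  return luma_mult   * plane_ssd(ref[0], rec[0], luma_stride,   w,     h)
       + chroma_mult * plane_ssd(ref[1], rec[1], chroma_stride, w / 2, h / 2)
       + chroma_mult * plane_ssd(ref[2], rec[2], chroma_stride, w / 2, h / 2);
}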
- copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1], UVG_BOTH_T); + copy_cu_pixels(lcu, &work_tree[depth + 1], cu_loc, UVG_BOTH_T); return ssd; } @@ -261,7 +458,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, const int stride = state->tile->frame->rec->stride; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { + for (int y_ = 0; y_ < height && y_ * 2 + y < state->tile->frame->height; y_++) { for (int x_ = 0; x_ < width; x_++) { int s = 4; s += y_rec[2 * x_] * 2; @@ -281,7 +478,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, if((y + height * 2) % 64 == 0) { int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; - for (int i = 0; i < width; ++i) { + for (int i = 0; i < width && i + x / 2 < stride2 / 2; ++i) { int s = 2; s += y_rec[i * 2] * 2; s += y_rec[i * 2 + 1]; @@ -301,71 +498,117 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu) +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf) { - const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - + // cur_cu is used for TU parameters. - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; double tr_tree_bits = 0; - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); + // Check that lcu is not in - const uint8_t tr_depth = tr_cu->tr_depth - depth; - - if (tr_depth > 0) { - int offset = width / 2; + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + // Recursively process sub-CUs. + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc[i], pred_cu, lcu, isp_cbf); + } return sum + tr_tree_bits * state->lambda; } + const bool is_not_isp = pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP; // Add transform_tree cbf_luma bit cost. 
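/* The recursion above picks an implicit transform split when the CU exceeds
 * the maximum transform size: quad-split when both dimensions are too large,
 * otherwise a binary split of only the offending dimension. As a standalone
 * sketch (local enum, not the encoder's split_type): */
enum tu_split { TU_NONE, TU_QT, TU_BT_VER, TU_BT_HOR };

static enum tu_split implicit_tu_split(int width, int height, int tr_max)
{
  if (width > tr_max && height > tr_max) return TU_QT;     /* four sub-TUs */
  if (width > tr_max)                    return TU_BT_VER; /* halve width */
  if (height > tr_max)                   return TU_BT_HOR; /* halve height */
  return TU_NONE;                                          /* fits in one TU */
}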
- const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; - int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (pred_cu->type == CU_INTRA || - is_tr_split || - cbf_is_set(tr_cu->cbf, depth, COLOR_U) || - cbf_is_set(tr_cu->cbf, depth, COLOR_V)) - { - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + if (is_not_isp) { + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int is_set = cbf_is_set(pred_cu->cbf, COLOR_Y); + if (pred_cu->type == CU_INTRA || + !PU_IS_TU(pred_cu) || + cbf_is_set(tr_cu->cbf, COLOR_U) || + cbf_is_set(tr_cu->cbf, COLOR_V)) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + } + + if (is_set && state->encoder_control->cfg.trskip_enable + && cu_loc->width <= (1 << state->encoder_control->cfg.trskip_max_size) + && cu_loc->height <= (1 << state->encoder_control->cfg.trskip_max_size)) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + } } - - if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + else { + // TODO: 8x4 CUs + const int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; + for (int i = 0; i < split_limit; i++) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); + luma_ctx = 2 + flag; + } + } } // SSD between reconstruction and original int ssd = 0; if (!state->encoder_control->cfg.lossless) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + cu_loc->width, cu_loc->height); } if (!skip_residual_coding) { - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + int8_t luma_scan_mode = SCAN_DIAG; + if (is_not_isp) { + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + else { + int split_type = pred_cu->intra.isp_mode; + int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, split_type, true); + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height, i, split_type, true); + const int part_x = split_loc.x; + const int part_y = split_loc.y; + + // TODO: maybe just pass the cu_loc_t to these functions + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, 
COEFF_ORDER_CU); + } + } } double bits = tr_tree_bits + coeff_bits; @@ -373,57 +616,58 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu) +double uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const cu_loc) { - const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; - const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const vector2d_t lcu_px = { (cu_loc->local_x) / 2, (cu_loc->local_y) / 2 }; + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); - + double tr_tree_bits = 0; double coeff_bits = 0; + + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, COLOR_U); + int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, COLOR_V); - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) { - // For MAX_PU_DEPTH calculate chroma for previous depth for the first - // block and return 0 cost for all others. - return 0; - } - int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U); - int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V); - - // See luma for why the second condition - if (!skip_residual_coding) { - const int tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - cabac->cur_ctx = ctx; - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - } - ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } - } - - - if (tr_cu->tr_depth > depth) { - int offset = LCU_WIDTH >> (depth + 1); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + // Recursively process sub-CUs. 
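/* The u_is_set / v_is_set derivation used above, as a plain-int sketch:
 * under joint Cb-Cr coding the two low bits of joint_cb_cr stand in for the
 * (cbf_cb, cbf_cr) pair; otherwise the ordinary coded-block flags apply. */
static void chroma_cbfs(int joint_cb_cr, int cbf_u, int cbf_v,
                        int *u_is_set, int *v_is_set)
{
  if (joint_cb_cr) {
    *u_is_set = (joint_cb_cr & 2) >> 1;  /* bit 1 carries the Cb flag */
    *v_is_set = joint_cb_cr & 1;         /* bit 0 carries the Cr flag */
  } else {
    *u_is_set = cbf_u;
    *v_is_set = cbf_v;
  }
}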
+ enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc[i]); + } return sum + tr_tree_bits * state->lambda; } + + if (!skip_residual_coding) { + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + + } + + if (state->encoder_control->cfg.jccr) { int cbf_mask = u_is_set * 2 + v_is_set - 1; @@ -441,23 +685,26 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width, cu_loc->chroma_height); int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width, cu_loc->chroma_height); ssd = ssd_u + ssd_v; } if (!skip_residual_coding) { int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + // We need the rounded & shifted coordinates for the chroma coeff calculation + cu_loc_t chroma_loc; + uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, 2, scan_order, 0); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } else { - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } } @@ -470,82 +717,104 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, static double cu_rd_cost_tr_split_accurate( const encoder_state_t* const state, - const int x_px, - const int y_px, - const int depth, const cu_info_t* const pred_cu, lcu_t* const lcu, - enum uvg_tree_type tree_type) { - const int width = LCU_WIDTH >> depth; - + enum uvg_tree_type tree_type, + uint8_t isp_cbf, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + bool has_chroma) { + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used 
for TU parameters. - cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; - double tr_tree_bits = 0; - - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - const uint8_t tr_depth = tr_cu->tr_depth - depth; - - const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U); - const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V); + double luma_bits = 0; + double chroma_bits = 0; + + const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, COLOR_U); + const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; { - int cbf = cbf_is_set_any(pred_cu->cbf, depth); + int cbf = cbf_is_set_any(tr_cu->cbf); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (pred_cu->type != CU_INTRA && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, luma_bits, "rqt_root_cbf"); } } - - bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8)) && tree_type != UVG_LUMA_T; - if( !skip_residual_coding && has_chroma) { - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); - } - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); - } - } - - if (tr_depth > 0) { - int offset = LCU_WIDTH >> (depth + 1); + + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; - - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type); - return sum + tr_tree_bits * state->lambda; + enum split_type split; + if(cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } else if(cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + const int split_count= uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + cu_loc_t split_chroma_cu_loc[4]; + if (chroma_loc) { + uvg_get_split_locs(chroma_loc, split, split_chroma_cu_loc, NULL); + } + for (int i = 0; i < split_count; ++i) { + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], chroma_loc ? 
&split_chroma_cu_loc[i] : NULL, has_chroma); + } + return sum + luma_bits * state->lambda; } - const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; + has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && has_chroma && tree_type != UVG_LUMA_T; + if (!skip_residual_coding && has_chroma) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, chroma_bits, "cbf_cb"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, chroma_bits, "cbf_cr"); + } + + const int cb_flag_y = cbf_is_set(tr_cu->cbf, COLOR_Y) && tree_type != UVG_CHROMA_T; + + const bool is_isp = !(pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. - const int is_tr_split = depth - tr_cu->depth; - if ((pred_cu->type == CU_INTRA || - is_tr_split || - cb_flag_u || - cb_flag_v) + if (!is_isp) { + const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; + if ((pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) && !skip_residual_coding && tree_type != UVG_CHROMA_T) - { - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, luma_bits, "cbf_y_search"); + } + } + else { + // TODO: 8x4 CUs + const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; + for (int i = 0; i < split_limit; i++) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, luma_bits, "cbf_y_search"); + luma_ctx = 2 + flag; + } + } } if (cb_flag_y || cb_flag_u || cb_flag_v) { // TODO qp_delta_sign_flag if ((cb_flag_u || cb_flag_v) && has_chroma && state->encoder_control->cfg.jccr) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, chroma_bits, "tu_joint_cbcr_residual_flag"); } } @@ -553,40 +822,66 @@ static double cu_rd_cost_tr_split_accurate( // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless && tree_type != UVG_CHROMA_T) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y; luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + width, height); } // Chroma transform skip enable/disable is non-normative, so we need to count the chroma // tr-skip bits even when we are never using it. 
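/* The eligibility test computed below, sketched as a predicate: transform
 * skip must be enabled and both block dimensions must fit in
 * 1 << trskip_max_size; with rectangular blocks the height check is no
 * longer implied by the width check, and ISP blocks are excluded. */
#include <stdbool.h>

static bool can_use_transform_skip(bool trskip_enable, int trskip_max_size,
                                   int width, int height, bool is_isp)
{
  const int max_size = 1 << trskip_max_size;  /* e.g. 1 << 2 == 4 */
  return trskip_enable && width <= max_size && height <= max_size && !is_isp;
}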
- const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !is_isp; - if(cb_flag_y){ + if(cb_flag_y || is_isp){ if (can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, luma_bits, "transform_skip_flag"); } - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + int8_t luma_scan_mode = SCAN_DIAG; + if (pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + else { + int split_type = pred_cu->intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + const int part_x = split_loc.x; + const int part_y = split_loc.y; + + // TODO: maybe just pass the cu_loc_t to these functions + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + } } - if(depth == 4 || tree_type == UVG_LUMA_T) { - if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, COLOR_Y, lcu)) { + const bool is_local_sep_tree = (cu_loc->width != chroma_loc->width || cu_loc->height != chroma_loc->height) && state->encoder_control->chroma_format != UVG_CSP_400; + + if(is_local_sep_tree || tree_type == UVG_LUMA_T) { + + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? 
UVG_LUMA_T : tree_type, COLOR_Y, cu_loc, lcu)) { const int lfnst_idx = tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[1], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } @@ -595,103 +890,106 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { - const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 }; - const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); - int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + cu_loc_t temp_chroma_loc; + const vector2d_t lcu_px = { chroma_loc->local_x >> 1, chroma_loc->local_y >> 1}; + uvg_cu_loc_ctor(&temp_chroma_loc, lcu_px.x, lcu_px.y, chroma_loc->width, chroma_loc->height); + const int chroma_width = chroma_loc->chroma_width; + const int chroma_height = chroma_loc->chroma_height; + int8_t scan_order = SCAN_DIAG; + //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); - const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size) + && chroma_height <= (1 << state->encoder_control->cfg.trskip_max_size); if(pred_cu->joint_cb_cr == 0) { if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[1]; unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[2]; chroma_ssd = ssd_u + ssd_v; } if(chroma_can_use_tr_skip && cb_flag_u) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } if(chroma_can_use_tr_skip && cb_flag_v) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &temp_chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * 
state->chroma_weights[3]; int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[3]; chroma_ssd = ssd_u_joint + ssd_v_joint; } if (chroma_can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, COLOR_U, scan_order, 0); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } - if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { - const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; + const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? chroma_loc : cu_loc, lcu) && tree_type != UVG_LUMA_T) { + const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, - &cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T], + &cabac->ctx.lfnst_idx_model[is_chroma_tree], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } tr_cu->lfnst_last_scan_pos = false; tr_cu->violates_lfnst_constrained_luma = false; tr_cu->violates_lfnst_constrained_chroma = false; - if (uvg_is_mts_allowed(state, tr_cu) && tree_type != UVG_CHROMA_T) { + if (uvg_is_mts_allowed(state, tr_cu, cu_loc) && tree_type != UVG_CHROMA_T) { bool symbol = tr_cu->tr_idx != 0; int ctx_idx = 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); ctx_idx++; for (int i = 0; i < 3 && symbol; i++, ctx_idx++) { symbol = tr_cu->tr_idx > i + MTS_DST7_DST7 ? 1 : 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); } tr_cu->mts_last_scan_pos = false; tr_cu->violates_mts_coeff_constraint = false; } - double bits = tr_tree_bits + coeff_bits; - return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda; + double bits = luma_bits + coeff_bits; + return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + (bits + chroma_bits) * state->lambda; } // Return estimate of bits used to code prediction mode of cur_cu. 
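The value returned by cu_rd_cost_tr_split_accurate above is a standard Lagrangian rate-distortion cost: luma and chroma squared error are weighted separately (UVG_LUMA_MULT, UVG_CHROMA_MULT) and the total bit count is traded off through lambda. A minimal, self-contained sketch of that combination, using placeholder weights rather than the encoder's actual constants:

#include <stdio.h>

/* Sketch only: luma_mult/chroma_mult stand in for UVG_LUMA_MULT and
 * UVG_CHROMA_MULT; their values here are illustrative. */
static double rd_cost(double luma_ssd, double chroma_ssd, double bits, double lambda)
{
    const double luma_mult = 0.8;
    const double chroma_mult = 1.5;
    return luma_ssd * luma_mult + chroma_ssd * chroma_mult + bits * lambda;
}

int main(void)
{
    /* A mode with less distortion but more bits can still lose once the
     * rate term is weighted by lambda. */
    printf("%f\n", rd_cost(1000.0, 200.0, 96.0, 40.0)); /* 4940.0 */
    printf("%f\n", rd_cost(1400.0, 250.0, 40.0, 40.0)); /* 3095.0 */
    return 0;
}

With lambda = 40 the second mode wins despite its higher distortion, because it saves 56 bits.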
-static double calc_mode_bits(const encoder_state_t *state, - const lcu_t *lcu, - const cu_info_t * cur_cu, - int x, int y, int depth) +static double calc_mode_bits( + const encoder_state_t *state, + const lcu_t *lcu, + const cu_info_t * cur_cu, + const cu_loc_t* const cu_loc) { assert(cur_cu->type == CU_INTRA); - double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu); + double mode_bits = uvg_luma_mode_bits(state, cur_cu, cu_loc, lcu); - if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { + if (((cu_loc->width == 4 && cu_loc->x % 8 && cu_loc->y % 8) || (cu_loc->width != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); } @@ -768,6 +1066,134 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) } +static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const chroma_loc, lcu_t* lcu, enum uvg_tree_type tree_type, bool has_chroma, const bool is_separate_tree, int x_local, int y_local) +{ + if(tree_type != UVG_CHROMA_T) { + if(cu_loc->x) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += TR_MAX_WIDTH) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_VER; + if(!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->luma_deblocking |= EDGE_VER; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(cu_loc->y) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += TR_MAX_WIDTH) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + + if(is_separate_tree && has_chroma) { + if (chroma_loc->x) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if (chroma_loc->y) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + 
LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + } + } + else { + + if (chroma_loc->x) { + for (int x = x_local; x < x_local + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(chroma_loc->width == 64) { + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(chroma_loc->y) { + for (int y = y_local; y < y_local + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (chroma_loc->height == 64) { + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + } +} + +static bool check_for_early_termination(const int cu_width, const int cu_height, const cu_info_t* const cur_cu, int x_local, int y_local, const + bool* improved, + int cbf, + lcu_t* split_lcu, + int split_type, + const bool* can_split) +{ + // The best non-split mode has no residual and the same-direction BT split did not improve, so do not try TT + // 3.11 + if ( + !cbf && ((!improved[BT_VER_SPLIT] && split_type == TT_VER_SPLIT) || + (!improved[BT_HOR_SPLIT] && split_type == TT_HOR_SPLIT))) + return true; + + + // 3.8 + if (split_type == TT_HOR_SPLIT && can_split[BT_HOR_SPLIT]) { + bool can_skip = true; + for (int x_scu = x_local; x_scu < x_local + cu_width; x_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local)->log2_height == cur_cu->log2_height - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local + cu_height / 2)->log2_height == cur_cu->log2_height - 1; + } + if (can_skip) return true; + } + if (split_type == TT_VER_SPLIT && can_split[BT_VER_SPLIT]) { + bool can_skip = true; + for (int y_scu = y_local; y_scu < y_local + cu_height; y_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local, y_scu)->log2_width == cur_cu->log2_width - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local + cu_width / 2, y_scu)->log2_width == cur_cu->log2_width - 1; + } + if (can_skip) return true; + } + return false; +} + /** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -780,17 +1206,24 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) */ static double search_cu( encoder_state_t* const state, - int x, - int y, - int depth, - lcu_t* work_tree, - enum uvg_tree_type - tree_type) + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + lcu_t* lcu, + enum uvg_tree_type tree_type, + const split_tree_t split_tree, + bool has_chroma) { + const int depth = split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const int cu_width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int luma_width = LCU_WIDTH >> depth; + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + const int x = cu_loc->x; + const int y = cu_loc->y; + const int luma_width = cu_loc->width; + const int luma_height = cu_loc->height; + + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -799,7 +1232,7 @@ static double search_cu( cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; cu_info_t hmvp_lut[MAX_NUM_HMVP_CANDS]; @@ -815,11 +1248,9 @@ static double search_cu( int32_t min; int32_t max; } pu_depth_inter, pu_depth_intra; - - lcu_t *const lcu = &work_tree[depth]; - - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); int32_t frame_width = frame->width; int32_t frame_height = frame->height; @@ -841,55 +1272,51 @@ static double search_cu( pu_depth_intra.min = ctrl->cfg.pu_depth_intra.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.min[gop_layer] : ctrl->cfg.pu_depth_intra.min[0]; pu_depth_intra.max = ctrl->cfg.pu_depth_intra.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.max[gop_layer] : ctrl->cfg.pu_depth_intra.max[0]; } - if(tree_type == UVG_CHROMA_T) { - pu_depth_intra.max = CLIP(1, 3, pu_depth_intra.max); - pu_depth_intra.min = CLIP(1, 3, pu_depth_intra.min); - } + pu_depth_inter.min = ctrl->cfg.pu_depth_inter.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.min[gop_layer] : ctrl->cfg.pu_depth_inter.min[0]; pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + memset(cur_cu, 0, sizeof(cu_info_t)); // Assign correct depth - cur_cu->depth = (depth > MAX_DEPTH) ? MAX_DEPTH : depth; - cur_cu->tr_depth = (depth > 0) ? depth : 1; cur_cu->type = CU_NOTSET; - cur_cu->part_size = SIZE_2Nx2N; cur_cu->qp = state->qp; - cur_cu->bdpcmMode = 0; - cur_cu->tr_idx = 0; - cur_cu->violates_mts_coeff_constraint = 0; - cur_cu->mts_last_scan_pos = 0; - cur_cu->violates_lfnst_constrained_luma = 0; - cur_cu->violates_lfnst_constrained_chroma = 0; - cur_cu->lfnst_last_scan_pos = 0; - cur_cu->lfnst_idx = 0; - cur_cu->joint_cb_cr = 0; + cur_cu->split_tree = split_tree.split_tree; + cur_cu->log2_width = uvg_g_convert_to_log2[cu_width]; + cur_cu->log2_height = uvg_g_convert_to_log2[cu_height]; + if(chroma_loc) { + cur_cu->log2_chroma_height = uvg_g_convert_to_log2[chroma_loc->chroma_height]; + cur_cu->log2_chroma_width = uvg_g_convert_to_log2[chroma_loc->chroma_width]; + } + + intra_search_data_t intra_search = {0}; + + const bool completely_inside = x + luma_width <= frame_width && y + luma_height <= frame_height; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
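Note the split_tree bookkeeping introduced above: the new split_tree field records the whole chain of split decisions in one word, three bits per depth level, which is why the recursion later in this function builds new_split.split_tree as split_tree.split_tree | split_type << (current_depth * 3). An illustrative sketch of that packing; the enumerator values are inferred from the indexing in this patch (QT_SPLIT = 1 through TT_VER_SPLIT = 5) and the helper names are not from the codebase:

#include <stdint.h>

enum split_type_sketch { NO_SPLIT = 0, QT_SPLIT = 1, BT_HOR_SPLIT = 2, BT_VER_SPLIT = 3, TT_HOR_SPLIT = 4, TT_VER_SPLIT = 5 };

/* Pack one 3-bit split decision per depth; a 32-bit word holds ten levels. */
static uint32_t set_split_at(uint32_t tree, int depth, enum split_type_sketch s)
{
    return tree | ((uint32_t)s << (depth * 3));
}

/* Mirrors what a reader such as GET_SPLITDATA(cu, depth) would extract. */
static enum split_type_sketch get_split_at(uint32_t tree, int depth)
{
    return (enum split_type_sketch)((tree >> (depth * 3)) & 7u);
}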
- if ( x + luma_width <= frame_width && y + luma_width <= frame_height) + if ( completely_inside) { int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != UVG_SLICE_I && - depth <= MAX_DEPTH && + split_tree.current_depth <= MAX_DEPTH && ( - WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || + WITHIN(split_tree.current_depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_width || (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_height - ); + ) && cu_loc->width == cu_loc->height; // Don't allow non square inter CUs for now if (can_use_inter) { double mode_cost; double mode_bitcost; uvg_search_cu_inter(state, - x, y, - depth, - lcu, - &mode_cost, &mode_bitcost); + cu_loc, lcu, + &mode_cost, + &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -907,7 +1334,7 @@ static double search_cu( int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(split_tree.current_depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. @@ -915,15 +1342,11 @@ static double search_cu( (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame_height) && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I); - intra_search_data_t intra_search; intra_search.cost = 0; if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { - intra_search.pred_cu.joint_cb_cr = 4; - uvg_search_cu_intra(state, x, y, depth, &intra_search, - lcu, - tree_type); + uvg_search_cu_intra(state, &intra_search, lcu, is_separate_tree ? 
UVG_LUMA_T : tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting @@ -936,72 +1359,80 @@ static double search_cu( } #endif if (state->encoder_control->cfg.cclm && tree_type != UVG_CHROMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, tree_type, true, false); + if(intra_search.pred_cu.intra.isp_mode == ISP_MODE_NO_ISP) { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + &intra_search.pred_cu, lcu, + tree_type, + true, + false); + } + else { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu, + NULL + ); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } double intra_cost = intra_search.cost; if (intra_cost < cost && tree_type != UVG_LUMA_T) { int8_t intra_mode = intra_search.pred_cu.intra.mode; - - // TODO: This heavily relies to square CUs - if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + + if ((has_chroma || tree_type == UVG_CHROMA_T) + && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.joint_cb_cr = 0; - // There is almost no benefit to doing the chroma mode search for - // rd2. Possibly because the luma mode search already takes chroma - // into account, so there is less of a chanse of luma mode being - // really bad for chroma. - if(tree_type == UVG_CHROMA_T) { - intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra; - intra_mode = intra_search.pred_cu.intra.mode; + if(tree_type == UVG_CHROMA_T || is_separate_tree) { + intra_mode = uvg_get_co_located_luma_mode( + chroma_loc, cu_loc, &intra_search.pred_cu, is_separate_tree ? lcu : NULL, + tree_type == UVG_CHROMA_T ? 
state->tile->frame->cu_array : NULL, + UVG_CHROMA_T); + state->collocated_luma_mode = intra_mode; intra_search.pred_cu.type = CU_INTRA; + } else if (intra_search.pred_cu.intra.mip_flag) { + intra_mode = 0; } - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) { - uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search, tree_type); - - if (intra_search.pred_cu.joint_cb_cr == 0) { - intra_search.pred_cu.joint_cb_cr = 4; - } - + uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, intra_mode, tree_type, is_separate_tree); } else if (!intra_search.pred_cu.intra.mip_flag) { - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; } else { intra_search.pred_cu.intra.mode_chroma = 0; } - - if(tree_type != UVG_CHROMA_T && ctrl->cfg.rdo >= 2) { - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, - tree_type, false, true); - intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu); + state->quant_blocks[2].needs_init = true; + uvg_intra_recon_cu(state, + &intra_search, chroma_loc, + &intra_search.pred_cu, lcu, + is_separate_tree ? UVG_CHROMA_T : tree_type, + false, + true); + if(tree_type != UVG_CHROMA_T) { + intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, chroma_loc); } else { intra_cost = intra_search.cost; } - intra_search.pred_cu.intra.mode = intra_mode; intra_search.pred_cu.violates_lfnst_constrained_chroma = false; intra_search.pred_cu.lfnst_last_scan_pos = false; } else { intra_search.pred_cu.intra.mode_chroma = intra_mode; } - intra_search.pred_cu.intra.mode = intra_mode; - if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, x_local, y_local, depth, depth, tree_type); - } } if (intra_cost < cost) { cost = intra_cost; @@ -1023,8 +1454,7 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_ibc(state, - x, y, - depth, + cu_loc, lcu, &mode_cost, &mode_bitcost); if (mode_cost < cost) { @@ -1041,30 +1471,82 @@ static double search_cu( // Reconstruct best mode because we need the reconstructed pixels for // mode search of adjacent CUs. 
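The reconstruction below also has to handle Intra Sub-Partitions, driven by uvg_get_isp_split_num() and uvg_get_isp_split_loc(): ISP divides an intra luma CU into equally sized rows or columns that are predicted and transformed one after another. A rough sketch of the geometry, assuming the VVC rule that 4x8 and 8x4 CUs yield two sub-partitions and larger CUs four (helper names here are hypothetical):

static int isp_split_num_sketch(int w, int h)
{
    return (w * h <= 32) ? 2 : 4; /* 4x8 and 8x4 have only 32 samples */
}

static void isp_split_loc_sketch(int x, int y, int w, int h, int idx, int vertical,
                                 int *sx, int *sy, int *sw, int *sh)
{
    const int n = isp_split_num_sketch(w, h);
    if (vertical) { *sw = w / n; *sh = h; *sx = x + idx * *sw; *sy = y; }
    else          { *sw = w; *sh = h / n; *sx = x; *sy = y + idx * *sh; }
}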
if (cur_cu->type == CU_INTRA) { - assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); bool recon_chroma = true; - bool recon_luma = tree_type != UVG_CHROMA_T; - if ((depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + bool recon_luma = tree_type != UVG_CHROMA_T && cur_cu->intra.isp_mode == ISP_MODE_NO_ISP; + if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T || cu_loc->chroma_height % 4 == 2) { recon_chroma = false; } - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, tree_type,recon_luma,recon_chroma); - if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, - tree_type,false,true); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + if (!state->encoder_control->cfg.cclm && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu, + NULL + ); } - if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + else { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + NULL, lcu, + tree_type, + recon_luma, recon_chroma); + } + + + if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) + || tree_type == UVG_CHROMA_T) { + intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; + if(tree_type != UVG_CHROMA_T) { + lcu_fill_chroma_cu_info( + lcu, + chroma_loc); + } + uvg_intra_recon_cu(state, + &intra_search, chroma_loc, + NULL, lcu, + UVG_CHROMA_T, + false, + true); + lcu_fill_chroma_cbfs( + lcu, + chroma_loc, + tree_type); + } else { + assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); + } + + // Set isp split cbfs here + const int split_type = intra_search.pred_cu.intra.isp_mode; + const int split_num = split_type == ISP_MODE_NO_ISP || tree_type == UVG_CHROMA_T ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); + + const int cbf_cb = cbf_is_set(cur_cu->cbf, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, COLOR_V); + const int jccr = cur_cu->joint_cb_cr; + for (int i = 0; i < split_num; ++i) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type, true); + // Fetching from CU array does not work for dimensions less than 4 + // Fetch proper x, y coords for isp blocks + int tmp_x = isp_loc.x; + int tmp_y = isp_loc.y; + uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y, MAX(cu_width, cu_height)); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); + bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; + cbf_clear(&split_cu->cbf, COLOR_Y); + cbf_clear(&split_cu->cbf, COLOR_U); + cbf_clear(&split_cu->cbf, COLOR_V); + if (cur_cbf) { + cbf_set(&split_cu->cbf, COLOR_Y); + } + if(cbf_cb) cbf_set(&split_cu->cbf, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, COLOR_V); + split_cu->joint_cb_cr = jccr; + } + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); } else if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { @@ -1075,35 +1557,28 @@ static double search_cu( if (cur_cu->inter.mv_dir & 1) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[0][0], &cur_cu->inter.mv[0][1]); if (cur_cu->inter.mv_dir & 2) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[1][0], &cur_cu->inter.mv[1][1]); } - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); + uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); - if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable && false) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; + // inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; } - + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, cu_width, cu_height); uvg_quantize_lcu_residual(state, true, has_chroma && !cur_cu->joint_cb_cr, - cur_cu->joint_cb_cr, x, y, - depth, + cur_cu->joint_cb_cr, &loc, NULL, lcu, false, - tree_type); + tree_type); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf); - if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -1113,132 +1588,268 @@ static double search_cu( inter_bitcost += cur_cu->merge_idx; } } - lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type); - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu, UVG_BOTH_T); } } + + // The cabac functions assume chroma locations whereas the search uses luma locations + // for the chroma tree, therefore we need to shift the chroma coordinates here for + // passing to the bit cost calculating 
functions. + cu_loc_t separate_tree_chroma_loc = *cu_loc; + separate_tree_chroma_loc.y >>= 1; + separate_tree_chroma_loc.x >>= 1; + separate_tree_chroma_loc.width >>= 1; + separate_tree_chroma_loc.height >>= 1; if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; + + bits += uvg_mock_encode_coding_unit( + state, + cabac, + cu_loc, + is_separate_tree && !has_chroma ? NULL : chroma_loc, + lcu, + cur_cu, + tree_type, + split_tree); - if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { - bits += uvg_mock_encode_coding_unit( - state, - cabac, - x, y, depth, - lcu, - cur_cu, - tree_type); - } - else { - assert(0); - } cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, intra_search.best_isp_cbfs, cu_loc, chroma_loc, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d %d %f\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree, cost); - if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { - cost = inter_zero_coeff_cost; + //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { + // cost = inter_zero_coeff_cost; - // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu, tree_type); + // // Restore saved pixels from lower level of the working tree. + // copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - } + // if (cur_cu->merged) { + // cur_cu->merged = 0; + // cur_cu->skipped = 1; + // lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + // } - if (cur_cu->tr_depth != depth) { - // Reset transform depth since there are no coefficients. This - // ensures that CBF is cleared for the whole area of the CU. - uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type); - } - - cur_cu->cbf = 0; - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); - } + // cur_cu->cbf = 0; + // lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + //} cabac->update = 0; + + mark_deblocking( + cu_loc, + chroma_loc, + lcu, + tree_type, + has_chroma, + is_separate_tree, + x_local, + y_local); + if (cur_cu->type == CU_INTRA && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP && tree_type != UVG_CHROMA_T) { + const int split_num = uvg_get_isp_split_num( cu_width, cu_height, cur_cu->intra.isp_mode,true); + for (int i = 1; i < split_num; i++) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc( + &isp_loc, + x, + y, + cu_width, + cu_height, + i, + cur_cu->intra.isp_mode, + true); + if (isp_loc.x % 4 || isp_loc.y % 4) continue; + mark_deblocking( + &isp_loc, + chroma_loc, + lcu, + UVG_LUMA_T, + false, + false, + isp_loc.local_x, + isp_loc.local_y); + } + } } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. 
cur_cu->type == CU_NOTSET || - (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || + (split_tree.current_depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I)) || (state->frame->slicetype != UVG_SLICE_I && - depth < pu_depth_inter.max); + split_tree.current_depth < pu_depth_inter.max); if(state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } - // Recursively split all the way to max search depth. - if (can_split_cu) { - int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); - double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + bool can_split[6]; + bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; + const int max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; + int minimum_split_amount; + switch (slice_type) { + case 0: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + case 1: minimum_split_amount = MIN(pu_depth_intra.min, pu_depth_inter.min) - split_tree.current_depth; break; + case 2: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + default: + assert(0 && "Incorrect_slice_type"); + } + if(minimum_split_amount > max_btd && !is_implicit && can_split[1]) { + // If the minimum search depth cannot be reached within the allowed number of MTT splits, + // disable the MTT split types here so that the quadtree split can still reach it. + can_split[2] = can_split[3] = can_split[4] = can_split[5] = false; + } + + can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; + + bool improved[6] = {false}; + + // If skip mode was selected for the block, skip further search. + // Skip mode means there are no coefficients in the block, so splitting + // might not give any better results but takes more time to do. + // It is ok to interrupt the search as soon as it is known that + // the split costs at least as much as not splitting. + int cbf = cbf_is_set_any(cur_cu->cbf); + + // 3.13 + if ((cu_height < 32 || cu_width < 32) && cur_cu->type != CU_NOTSET && !cbf && split_tree.mtt_depth > 1 && tree_type != UVG_CHROMA_T) { + can_split_cu = false; + } + + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF || true)) { + lcu_t * split_lcu = MALLOC(lcu_t, 5); + enum split_type best_split = 0; + double best_split_cost = MAX_DOUBLE; cabac_data_t post_seach_cabac; + cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); - memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + // Recursively split all the way to max search depth. 
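The split loop below leans on several numbered speed-up heuristics. For instance, the "3.9" check prunes a split type as soon as the bits needed merely to signal the split, weighted by lambda, eat up the headroom left by an optimistic distortion bound (the best cost so far divided by 1.075, or 1.1 above QP 30). A numeric illustration of that test:

#include <stdbool.h>

/* Prune when the split signalling alone cannot be amortized ("3.9"). */
static bool prune_split_sketch(double split_bits, double lambda, double best_cost, int qp)
{
    const double factor = qp > 30 ? 1.1 : 1.075;
    return split_bits * lambda + best_cost / factor > best_cost;
}
/* With best_cost = 5000, lambda = 40, qp = 32 the split is rejected once its
   flags cost more than 5000 * (1 - 1/1.1) / 40, i.e. about 11.4 bits. */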
+ for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { + if (!can_split[split_type]) + continue; + split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_type != QT_SPLIT && is_implicit), + 0 + }; + + if (completely_inside && check_for_early_termination( + cu_width, + cu_height, + cur_cu, + x_local, + y_local, + improved, + cbf, + split_lcu, + split_type, + can_split)) { + can_split[split_type] = false; + continue; + } + + double split_cost = 0.0; + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); - double split_bits = 0; + double split_bits = 0; - if (depth < MAX_DEPTH) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { - state->search_cabac.update = 1; - // Add cost of cu_split_flag. - const cu_info_t* left_cu = NULL, * above_cu = NULL; - if (x) { - if (x_local || tree_type != UVG_CHROMA_T) { - left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + state->search_cabac.update = 1; + // Add cost of cu_split_flag. + const cu_info_t* left_cu = NULL, * above_cu = NULL; + if (x) { + if (x_local || tree_type != UVG_CHROMA_T) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + else { + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); + } } - else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + if (y) { + if (y_local || tree_type != UVG_CHROMA_T) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); + } + else { + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); + } + } + split_tree_t count_tree = split_tree; + count_tree.split_tree = split_tree.split_tree | split_type << (split_tree.current_depth * 3); + uvg_write_split_flag( + state, + &state->search_cabac, + left_cu, + above_cu, + cu_loc, + count_tree, + tree_type, + &is_implicit, + &split_bits + ); + } + + // 3.9 + const double factor = state->qp > 30 ? 1.1 : 1.075; + if (split_bits * state->lambda + cost / factor > cost) { + can_split[split_type] = false; + continue; + } + + + state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; + + // 3.7 + bool stop_to_qt = false; + + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; + initialize_partial_work_tree(state, lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? chroma_loc : cu_loc, tree_type); + for (int split = 0; split < splits; ++split) { + new_split.part_index = split; + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], + &split_lcu[split_type -1], + tree_type, new_split, + !separate_chroma || (split == splits - 1 && has_chroma)); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + + if (split_type == QT_SPLIT && completely_inside) { + const cu_info_t * const t = LCU_GET_CU_AT_PX( + &split_lcu[0], + new_cu_loc[split].local_x, + new_cu_loc[split].local_y); + stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; + } + + if (split_cost > cost || split_cost > best_split_cost) { + can_split[split_type] = false; + break; } } - if (y) { - if (y_local || tree_type != UVG_CHROMA_T) { - above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); - } - else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); - } + + improved[split_type] = cost > split_cost; + + if (split_cost < best_split_cost) { + best_split_cost = split_cost; + best_split = split_type; + memcpy(&best_split_cabac, &state->search_cabac, sizeof(cabac_data_t)); } - uvg_write_split_flag( - state, - &state->search_cabac, - left_cu, - above_cu, - 1, - depth, - cu_width, - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), - tree_type, - &split_bits); - } - - state->search_cabac.update = 0; - split_cost += split_bits * state->lambda; - - // If skip mode was selected for the block, skip further search. - // Skip mode means there's no coefficients in the block, so splitting - // might not give any better results but takes more time to do. - // It is ok to interrupt the search as soon as it is known that - // the split costs at least as much as not splitting. - if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); - } else { - split_cost = INT_MAX; + if (stop_to_qt) break; } // If no search is not performed for this depth, try just the best mode @@ -1253,59 +1864,66 @@ static double search_cu( && tree_type == UVG_BOTH_T) { - cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&split_lcu[best_split - 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. - if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + if (cu_d1->type == CU_INTRA && (cu_d1->log2_height + 1 == cur_cu->log2_height || cu_d1->log2_width + 1 == cur_cu->log2_width)) { cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + bool is_implicit = false; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, - 0, depth, cu_width, x, y, tree_type, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &is_implicit, &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; - cur_cu->part_size = SIZE_2Nx2N; + if (cur_cu->intra.mode_chroma > 79) { + cur_cu->intra.mode_chroma = cur_cu->intra.mode; + } // Disable MRL in this case cur_cu->intra.multi_ref_idx = 0; cur_cu->lfnst_idx = 0; cur_cu->cr_lfnst_idx = 0; - - uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth, tree_type); - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); intra_search_data_t proxy; FILL(proxy, 0); proxy.pred_cu = *cur_cu; uvg_intra_recon_cu(state, - x, y, - depth, - &proxy, + &proxy, cu_loc, NULL, lcu, - tree_type, true, state->encoder_control->chroma_format == UVG_CSP_400); + tree_type, + true, + state->encoder_control->chroma_format != UVG_CSP_400); - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; + double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits; cost += mode_bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); + + mark_deblocking(cu_loc, chroma_loc, lcu, tree_type, has_chroma, is_separate_tree, x_local, y_local); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } - if (split_cost < cost) { + if (best_split_cost < cost) { // Copy split modes to this depth. - cost = split_cost; - work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr, tree_type); + cost = best_split_cost; + memcpy(&state->search_cabac, &best_split_cabac, sizeof(best_split_cabac)); + work_tree_copy_up(&split_lcu[best_split -1], lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc, is_separate_tree && !has_chroma ? NULL : chroma_loc); + downsample_cclm_rec( + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] + ); #if UVG_DEBUG //debug_split = 1; #endif @@ -1313,9 +1931,8 @@ static double search_cu( // Copy this CU's mode all the way down for use in adjacent CUs mode // search. memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); - work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); if (state->frame->slicetype != UVG_SLICE_I) { @@ -1329,21 +1946,21 @@ static double search_cu( } // Add candidate when in inter slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } else { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } - } else if (depth >= 0 && depth < MAX_PU_DEPTH) { + FREE_POINTER(split_lcu); + } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. 
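The downsample_cclm_rec() calls in this region keep a half-resolution copy of the luma reconstruction up to date for CCLM chroma prediction; with the rectangular blocks introduced by this patch the call now passes cu_width / 2 and cu_height / 2 separately. The actual CCLM downsampling filters are defined by the VVC spec; a plain 2x2 rounded average conveys the idea:

#include <stdint.h>
typedef uint16_t pixel_sketch; /* stand-in for the encoder's pixel type */

static void downsample_2x2_avg(const pixel_sketch *src, int src_stride,
                               pixel_sketch *dst, int dst_stride,
                               int dst_w, int dst_h)
{
    for (int y = 0; y < dst_h; y++) {
        for (int x = 0; x < dst_w; x++) {
            const pixel_sketch *p = src + 2 * y * src_stride + 2 * x;
            /* Rounded average of the 2x2 luma neighbourhood. */
            dst[y * dst_stride + x] = (p[0] + p[1] + p[src_stride] + p[src_stride + 1] + 2) >> 2;
        }
    }
}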
- work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); if(tree_type != UVG_CHROMA_T) { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } @@ -1358,7 +1975,7 @@ static double search_cu( } // Add candidate when in inter slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } @@ -1493,10 +2110,9 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i // Copy non-reference CUs to picture. uvg_cu_array_copy_from_lcu( tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, - tree_type != UVG_CHROMA_T ? x_px : x_px / 2, - tree_type != UVG_CHROMA_T ? y_px : y_px / 2, - lcu, - tree_type); + x_px, + y_px, + lcu); // Copy pixels to picture. { @@ -1540,30 +2156,34 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // will use these as temporary storage for predictions before making // a decision on which to use, and they get updated during the search // process. - lcu_t work_tree[MAX_PU_DEPTH + 1]; - init_lcu_t(state, x, y, &work_tree[0], hor_buf, ver_buf); - for (int depth = 1; depth <= MAX_PU_DEPTH; ++depth) { - work_tree[depth] = work_tree[0]; - } + lcu_t work_tree; + init_lcu_t(state, x, y, &work_tree, hor_buf, ver_buf); // If the ML depth prediction is enabled, // generate the depth prediction interval // for the current lcu constraint_t* constr = state->constraint; if (constr->ml_intra_depth_ctu) { - uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree[0].ref.y, state->qp); + uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree.ref.y, state->qp); } int tree_type = state->frame->slicetype == UVG_SLICE_I - && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T; + && state->encoder_control->cfg.dual_tree + ? UVG_LUMA_T + : UVG_BOTH_T; + + cu_loc_t start; + uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; // Start search from depth 0. double cost = search_cu( - state, - x, - y, - 0, - work_tree, - tree_type); + state, + &start, + &start, + &work_tree, + tree_type, + split_tree, + tree_type == UVG_BOTH_T); // Save squared cost for rate control. if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1572,29 +2192,28 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // The best decisions through out the LCU got propagated back to depth 0, // so copy those back to the frame. - copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); + copy_lcu_to_cu_data(state, x, y, &work_tree, tree_type); // Copy coeffs to encoder state. 
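copy_coeffs() below gains explicit width, height and stride arguments, since with MTT the coefficient blocks are no longer implicitly square. A sketch of such a strided 2D copy; the parameter order is an assumption based on the call sites:

#include <string.h>
#include <stdint.h>
typedef int16_t coeff_sketch; /* stand-in for coeff_t */

static void copy_coeffs_sketch(const coeff_sketch *src, coeff_sketch *dst,
                               int width, int height, int stride)
{
    for (int j = 0; j < height; j++) {
        /* One row at a time, so rectangular sub-blocks inside an
           LCU-wide buffer are copied correctly. */
        memcpy(dst + j * stride, src + j * stride, width * sizeof(coeff_sketch));
    }
}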
- copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH); + copy_coeffs(work_tree.coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( - state, - x, - y, - 0, - work_tree, - UVG_CHROMA_T); + state, &start, + &start, + &work_tree, UVG_CHROMA_T, + split_tree, + true); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; } - copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); + copy_lcu_to_cu_data(state, x, y, &work_tree, UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); } } diff --git a/src/search.h b/src/search.h index 7566fb96..809a4635 100644 --- a/src/search.h +++ b/src/search.h @@ -84,19 +84,17 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map); void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff); -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu); -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu); +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf); +double uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const); -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type); - -void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); -void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); #endif diff --git a/src/search_ibc.c b/src/search_ibc.c index 44f9ac50..2d80ec28 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -75,7 +75,8 @@ typedef struct { * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks */ - optimized_sad_func_ptr_t optimized_sad; + optimized_sad_func_ptr_t optimized_sad_y; + optimized_sad_func_ptr_t optimized_sad_uv; lcu_t *lcu; @@ -109,8 +110,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x, } -static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -132,9 +135,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu 
cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; + uint32_t width = loc->width; + uint32_t height = loc->height; cost = uvg_satd_any_size(width, width, @@ -162,10 +167,15 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu } -static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_sad(ibc_search_info_t *info, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - + const uint32_t x = loc->x; + const uint32_t y = loc->y; + lcu_t *lcu = info->lcu; + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const encoder_state_t* state = info->state; + cu_info_t cu_backup = *cur_cu; uint32_t cost = MAX_INT; @@ -173,6 +183,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s const int y_scu = SUB_SCU(y); const uint32_t offset = x_scu + y_scu * LCU_WIDTH; const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + const uint32_t width = loc->width; + const uint32_t height = loc->height; cur_cu->type = CU_IBC; cur_cu->inter.mv_dir = 1; @@ -183,23 +195,26 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; - if (optimized_sad != NULL) { - cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - } + if (info->optimized_sad_y != NULL) { + cost = info->optimized_sad_y(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); } else { cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { + } + + // ToDo: Enable chroma cost calculation + /* if (state->encoder_control->chroma_format != UVG_CSP_400) { + if (info->optimized_sad_uv != NULL) { + cost += info->optimized_sad_uv(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + cost += info->optimized_sad_uv(lcu->rec.v + 
offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + } else { cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); } - } + }*/ return cost; } @@ -235,8 +250,11 @@ static bool check_mv_cost(ibc_search_info_t *info, double bitcost = 0; double cost = MAX_DOUBLE; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height); - cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y); + + cost = calculate_ibc_cost_sad(info, &loc, x, y); if (cost >= *best_cost) return false; @@ -246,7 +264,7 @@ static bool check_mv_cost(ibc_search_info_t *info, info->mv_cand, NULL, 0, - NULL, + 0, &bitcost ); @@ -782,63 +800,47 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_ibc(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - unit_stats_map_t *amvp, - unit_stats_map_t *merge, - ibc_search_info_t *info) +static void search_pu_ibc( + encoder_state_t * const state, + const cu_loc_t * const cu_loc, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + ibc_search_info_t *info) { - const uvg_config *cfg = &state->encoder_control->cfg; - const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); - - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = cu_loc->width; + const int height_cu= cu_loc->height; lcu_t *lcu = info->lcu; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); - cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; - cur_pu->qp = state->qp; - cur_pu->inter.mv_dir = 1; + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_IBC; + cur_pu->qp = state->qp; + cur_pu->inter.mv_dir = 1; // Default to candidate 0 CU_SET_MV_CAND(cur_pu, 0, 0); - + FILL(*info, 0); - info->state = state; - info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; - info->mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); - info->lcu = lcu; + info->state = state; + info->pic = frame->source; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; + info->mvd_cost_func = + cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info->optimized_sad_y = uvg_get_optimized_sad(width_cu); + info->optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); + info->lcu = lcu; // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu); @@ -853,7 +855,7 @@ static void search_pu_ibc(encoder_state_t * const state, #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0); #else const double no_skip_flag = 0; #endif @@ -875,7 +877,7 @@ static void search_pu_ibc(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_IBC; merge->unit[merge->size].merge_idx = merge_idx; @@ -883,11 +885,11 @@ static void search_pu_ibc(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -909,7 +911,7 @@ static void search_pu_ibc(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -919,6 +921,7 @@ static void search_pu_ibc(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. 
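Throughout the patch, cbf_is_set() and cbf_is_set_any() drop their depth argument, reflecting that coded-block flags are now tracked as a flat per-plane mask on the CU rather than per transform depth, which is exactly what the merge/skip decision below inspects. A toy model of that bookkeeping (names and bit layout are illustrative, not the codebase's):

#include <stdbool.h>
#include <stdint.h>

enum color_sketch { CBF_Y = 0, CBF_U = 1, CBF_V = 2 };

/* One bit per colour plane; "any" reduces to a non-zero test. */
static void cbf_set_sketch(uint8_t *cbf, enum color_sketch c) { *cbf |= (uint8_t)(1u << c); }
static void cbf_clear_sketch(uint8_t *cbf, enum color_sketch c) { *cbf &= (uint8_t)~(1u << c); }
static bool cbf_is_set_sketch(uint8_t cbf, enum color_sketch c) { return (cbf >> c) & 1u; }
static bool cbf_is_set_any_sketch(uint8_t cbf) { return cbf != 0; }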
@@ -927,19 +930,18 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cu_loc, cur_pu, lcu, true, UVG_BOTH_T); + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -964,15 +966,12 @@ static void search_pu_ibc(encoder_state_t * const state, // Do the motion search - uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, + uvg_inter_get_mv_cand(info->state, info->mv_cand, cur_pu, lcu, - NULL); + 0, + cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1003,9 +1002,7 @@ static void search_pu_ibc(encoder_state_t * const state, best_cost = calculate_ibc_cost_satd( info->state, lcu, - info->origin.x, - info->origin.y, - info->width, + cu_loc, (best_mv.x >> INTERNAL_MV_PREC), (best_mv.y >> INTERNAL_MV_PREC)); best_cost += best_bits * info->state->lambda; @@ -1052,16 +1049,16 @@ static void search_pu_ibc(encoder_state_t * const state, }; - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (state->encoder_control->cfg.rdo >= 2) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); } if(cfg->rdo < 2) { int predmode_ctx; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -1077,33 +1074,29 @@ static void search_pu_ibc(encoder_state_t * const state, #include "threads.h" static int uvg_search_hash_cu_ibc(encoder_state_t* const state, - int x, int y, int depth, + const cu_loc_t* cu_loc, lcu_t* lcu, double* inter_cost, double* inter_bitcost) { - const int x_cu = x; - const int y_cu = y; + const int x_cu = cu_loc->x; + const int y_cu = cu_loc->y; const int part_mode = SIZE_2Nx2N; const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; 
- const int width_cu = LCU_WIDTH >> depth; - const int width = PU_GET_W(part_mode, width_cu, 0); - const int height = PU_GET_H(part_mode, width_cu, 0); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; const bool merge_a1 = true; const bool merge_b1 = true; ibc_search_info_t info; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(x_cu); + const int y_local = SUB_SCU(y_cu); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1113,24 +1106,20 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.state = state; info.pic = frame->source; - info.origin.x = x; - info.origin.y = y; - info.width = width; - info.height = height; + info.origin.x = cu_loc->x; + info.origin.y = cu_loc->y; + info.width = width_cu; + info.height = height_cu; info.mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info.optimized_sad = uvg_get_optimized_sad(width); + info.optimized_sad_y = uvg_get_optimized_sad(width_cu); + info.optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); info.lcu = lcu; // Search for merge mode candidates info.num_merge_cand = uvg_inter_get_merge_cand( state, - x, - y, - width, - height, - merge_a1, - merge_b1, + cu_loc, info.merge_cand, lcu); @@ -1145,17 +1134,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, static int evaluations = 0; static int hits = 0; - - UVG_CLOCK_T hashmap_start_temp; - UVG_CLOCK_T hashmap_end_temp; - - UVG_CLOCK_T hashmap_start_real_time; UVG_CLOCK_T hashmap_end_real_time; UVG_GET_TIME(&hashmap_start_real_time); - int xx = x; - int yy = y; + int xx = x_cu; + int yy = y_cu; int best_mv_x = INT_MAX>>2; int best_mv_y = INT_MAX>>2; @@ -1185,12 +1169,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, int pos_y = result->value & 0xffff; int mv_x = pos_x - xx; int mv_y = pos_y - yy; - if (pos_x <= xx - width && pos_y <= yy - height) { + if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) { valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); if (valid_mv) { bool full_block = true; // Is the full block covered by the IBC? 
- for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) { - for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) { + for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) { + for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) { uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[ ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE]; @@ -1211,7 +1195,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, if (full_block) { double cost = ibc_cost, bits = ibc_bitcost; vector2d_t mv = { best_mv_x, best_mv_y}; - cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits); + cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, 0, &bits); //double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; //cost += bool better_mv = cost < ibc_cost; @@ -1220,7 +1204,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, best_mv_y = mv_y; ibc_cost = cost; ibc_bitcost = bits; - fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); + fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y); found_block = true; //break; } @@ -1238,7 +1222,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, //if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64) //fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found); - if (!found_block) return; + if (!found_block) return 0; *inter_cost = ibc_cost; *inter_bitcost = ibc_bitcost; @@ -1267,18 +1251,16 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->skipped = merged; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); ibc_cost += ibc_flag * state->lambda; ibc_bitcost += ibc_flag; uvg_inter_recon_cu( state, lcu, - x, - y, - CU_WIDTH_FROM_DEPTH(depth), true, - state->encoder_control->chroma_format != UVG_CSP_400); + state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range( @@ -1286,7 +1268,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } - + return 1; } @@ -1305,17 +1287,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, * \param inter_bitcost Return inter bitcost */ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) + const cu_loc_t * const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; + // Quick hashmap search /* uvg_search_hash_cu_ibc( state, - x, y, depth, + cu_loc, lcu, inter_cost, inter_bitcost); @@ -1330,8 +1313,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, info.lcu = lcu; search_pu_ibc(state, - x, y, depth, - SIZE_2Nx2N, 0, + cu_loc, amvp, &merge, &info); @@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state, return; } - 
const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; cur_pu->type = CU_IBC; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_ibc.h b/src/search_ibc.h index 14ce3b6f..b3c4e544 100644 --- a/src/search_ibc.h +++ b/src/search_ibc.h @@ -46,7 +46,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, + const cu_loc_t * const cu_loc, lcu_t *lcu, double *inter_cost, double* inter_bitcost); diff --git a/src/search_inter.c b/src/search_inter.c index 6508995f..76c7fc36 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc, /** * \brief Perform inter search for a single reference frame. */ -static void search_pu_inter_ref(inter_search_info_t *info, - int depth, +static void search_pu_inter_ref( + inter_search_info_t *info, lcu_t *lcu, cu_info_t *cur_cu, unit_stats_map_t *amvp) @@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); + uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + info->mv_cand, + cur_cu, + lcu, + ref_list, + &cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info, /** * \brief Search bipred modes for a PU. */ -static void search_pu_inter_bipred(inter_search_info_t *info, - int depth, - lcu_t *lcu, - unit_stats_map_t *amvp_bipred) +static void search_pu_inter_bipred( + inter_search_info_t *info, + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; const videoframe_t * const frame = info->state->tile->frame; @@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc); } // Don't try merge candidates that don't satisfy mv constraints. 
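This signature change is the pattern the whole patch follows: the loose (x, y, width, height, depth) argument packs are replaced by a single cu_loc_t built once with uvg_cu_loc_ctor(). A rough sketch of what the call sites in this diff imply about the struct, assuming 4:2:0 subsampling and 64x64 LCUs; the real definition lives in cu.h and may differ in detail:

/* Sketch of cu_loc_t; field names follow the accesses visible in this
 * diff (x, y, local_x, local_y, width, height, chroma_width,
 * chroma_height). */
typedef struct {
  int x, y;                         /* luma position in the picture */
  int local_x, local_y;             /* position within the containing LCU */
  int width, height;                /* luma block dimensions */
  int chroma_width, chroma_height;  /* half the luma size in 4:2:0 */
} cu_loc_sketch_t;

static void cu_loc_ctor_sketch(cu_loc_sketch_t *loc,
                               int x, int y, int width, int height)
{
  loc->x = x;
  loc->y = y;
  loc->local_x = x & 63;            /* SUB_SCU() equivalent for 64x64 LCUs */
  loc->local_y = y & 63;
  loc->width = width;
  loc->height = height;
  loc->chroma_width = width >> 1;
  loc->chroma_height = height >> 1;
}

Passing one pointer instead of four or five scalars is what lets the later hunks delete the depth and part_mode parameters without rewriting every call site twice.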
@@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info, uvg_inter_recon_bipred(info->state, ref->images[ref_LX[0][merge_cand[i].ref[0]]], ref->images[ref_LX[1][merge_cand[j].ref[1]]], - x, y, - width, - height, mv, lcu, true, - false); + false, + &cu_loc); const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride]; @@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, +static void search_pu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, lcu_t *lcu, unit_stats_map_t *amvp, unit_stats_map_t *merge, @@ -1678,25 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state, { const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1707,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state, info->state = state; info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); + info->optimized_sad = uvg_get_optimized_sad(width_cu); // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu ); @@ -1754,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state, // If bipred is not enabled, do not try candidates with mv_dir == 3. // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. 
if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); @@ -1768,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_INTER; merge->unit[merge->size].merge_idx = merge_idx; @@ -1776,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -1802,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -1812,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. 
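Note that the bipred size restriction appears in two spellings: mv_dir == 3 merge candidates are rejected above with cu_loc->width + cu_loc->height > 12, while the can_use_bipred check in the next hunk uses width + height >= 16. Because block dimensions are dyadic (4, 8, 16, ...), the possible sums are 8, 12, 16, ..., so the two tests accept exactly the same blocks and both bar 4x4, 4x8 and 8x4 from bipred, as the standard requires. A small self-checking sketch of that equivalence (helper names are illustrative):

#include <assert.h>

static int bipred_ok_gt12(int w, int h) { return w + h > 12; }  /* merge-loop form */
static int bipred_ok_ge16(int w, int h) { return w + h >= 16; } /* can_use_bipred form */

int main(void)
{
  const int sizes[] = { 4, 8, 16, 32, 64 };
  for (int i = 0; i < 5; i++) {
    for (int j = 0; j < 5; j++) {
      /* Dyadic sums land on 8, 12, 16, ... so the two forms agree. */
      assert(bipred_ok_gt12(sizes[i], sizes[j]) == bipred_ok_ge16(sizes[i], sizes[j]));
    }
  }
  assert(!bipred_ok_gt12(4, 8) && !bipred_ok_gt12(8, 4) && bipred_ok_gt12(8, 8));
  return 0;
}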
@@ -1824,22 +1811,22 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T); + + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, + cu_loc, cur_pu, lcu, true, - UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + UVG_BOTH_T); + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -1871,7 +1858,7 @@ static void search_pu_inter(encoder_state_t * const state, info->ref_idx = ref_idx; info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, lcu, cur_pu, amvp); } assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); @@ -1936,14 +1923,11 @@ static void search_pu_inter(encoder_state_t * const state, info->ref = ref->images[info->ref_idx]; uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - unipred_pu, - lcu, - list); + info->mv_cand, + unipred_pu, + lcu, + list, + cu_loc); double frac_cost = MAX_DOUBLE; double frac_bits = MAX_INT; @@ -1964,8 +1948,8 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + if (state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc); } amvp[list].cost[key] = frac_cost; @@ -1987,15 +1971,15 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); + if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc); } // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B && cfg->bipred - && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred + 
&& cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { @@ -2026,25 +2010,23 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc); } uvg_inter_recon_bipred(info->state, - ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], - ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], - x, y, - width, - height, - mv, - lcu, - true, - false); + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], + mv, lcu, + true, + false, + cu_loc + ); - const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; + const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; best_bipred_cost = - uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); + uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH); double bitcost[2] = { 0, 0 }; @@ -2091,17 +2073,17 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]); assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); uvg_sort_keys_by_cost(&amvp[2]); - if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc); } } if(cfg->rdo < 2) { int predmode_ctx; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -2135,22 +2117,19 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - cu_info_t* cur_cu, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost){ +void uvg_cu_cost_inter_rd2( + encoder_state_t * const state, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost, + const cu_loc_t* const cu_loc){ - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T); + const int x_px = SUB_SCU(cu_loc->x); + const int y_px = SUB_SCU(cu_loc->y); + const int width = cu_loc->width; + const int height = cu_loc->height; - const int x_px = SUB_SCU(x); - const int y_px = SUB_SCU(y); - const int width = LCU_WIDTH >> depth; cabac_data_t 
cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); cabac_data_t* cabac = &state->search_cabac; @@ -2160,31 +2139,43 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc); int index = y_px * LCU_WIDTH + x_px; double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width) * UVG_LUMA_MULT; + width, height) * UVG_LUMA_MULT; if (reconstruct_chroma) { int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width, cu_loc->chroma_height); double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width, cu_loc->chroma_height); ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; } double no_cbf_bits; double bits = 0; - const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL); + + int8_t depth = 0; + int8_t mtt_depth = 0; + uint32_t splits = cur_cu->split_tree; + while (splits & 7) { + if ((splits & 7) != QT_SPLIT) { + mtt_depth++; + } + depth++; + splits >>= 3; + } + const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth, 0}; + if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; @@ -2194,20 +2185,20 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, state->encoder_control->cfg.chroma_trskip_enable; double chroma_cost = 0; - if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && cur_cu->depth == cur_cu->tr_depth && reconstruct_chroma) { + if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && PU_IS_TU(cur_cu) && reconstruct_chroma) { uvg_quantize_lcu_residual(state, true, false, - false, x, y, - depth, + false, + cu_loc, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C]; - uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width); - uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, width, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, height, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, height, LCU_WIDTH_C, width); ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; @@ -2216,6 +2207,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, u_pred, 
u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -2223,19 +2215,17 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_pred, v_resi, width, + height, LCU_WIDTH_C, width); uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &cabac_copy, - width, - width, + cu_loc, index, - 0, cur_cu, u_pred, v_pred, @@ -2243,41 +2233,41 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_resi, &chorma_ts_out, UVG_BOTH_T); - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); if (chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { cur_cu->joint_cb_cr = 0; cur_cu->tr_skip |= (chorma_ts_out.best_u_index == CHROMA_TS) << COLOR_U; cur_cu->tr_skip |= (chorma_ts_out.best_v_index == CHROMA_TS) << COLOR_V; - if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_U); + if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; } else { cur_cu->joint_cb_cr = chorma_ts_out.best_combined_index; - if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, COLOR_U); + if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_combined_cost; } } else { uvg_quantize_lcu_residual(state, true, reconstruct_chroma, - reconstruct_chroma && state->encoder_control->cfg.jccr, x, y, - depth, + reconstruct_chroma && state->encoder_control->cfg.jccr, + cu_loc, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); } - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf); if(cbf) { - *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); + *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0); if (reconstruct_chroma) { - if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { - *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + if (!PU_IS_TU(cur_cu) || !state->encoder_control->cfg.jccr) { + *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc); } else { *inter_cost += chroma_cost; @@ -2297,7 +2287,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->skipped = 1; } *inter_cost = no_cbf_cost; @@ -2321,11 +2311,12 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2338,12 +2329,8 @@ void uvg_search_cu_inter(encoder_state_t * const state, inter_search_info_t info; search_pu_inter(state, - x, y, depth, - 
SIZE_2Nx2N, 0, - lcu, - amvp, - &merge, - &info); + cu_loc, lcu, amvp, + &merge, &info); // Early Skip CU decision if (merge.size == 1 && merge.unit[0].skipped) { @@ -2385,13 +2372,14 @@ void uvg_search_cu_inter(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index d76dd927..cdabd15a 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -73,11 +73,12 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state, int32_t ref_idx, double *bitcost); -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost); +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost); @@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); -void uvg_cu_cost_inter_rd2(encoder_state_t* const state, - int x, int y, int depth, +void uvg_cu_cost_inter_rd2( + encoder_state_t* const state, cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, - double* inter_bitcost); + double* inter_bitcost, + const cu_loc_t* const cu_loc); int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); diff --git a/src/search_intra.c b/src/search_intra.c index 226c40c3..a644ed9c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -49,6 +49,7 @@ #include "strategies/strategies-picture.h" #include "videoframe.h" #include "strategies/strategies-quant.h" +#include "uvg_math.h" // Normalize SAD for comparison against SATD to estimate transform skip @@ -129,17 +130,31 @@ static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double * * * \return */ -static void get_cost_dual(encoder_state_t * const state, - const pred_buffer preds, const uvg_pixel *orig_block, - cost_pixel_nxn_multi_func *satd_twin_func, - cost_pixel_nxn_multi_func *sad_twin_func, - int width, double *costs_out) +static void get_cost_dual( + encoder_state_t * const state, + const pred_buffer preds, + const uvg_pixel *orig_block, + cost_pixel_nxn_multi_func *satd_twin_func, + cost_pixel_nxn_multi_func *sad_twin_func, + int width, + int height, + double *costs_out) { #define PARALLEL_BLKS 2 unsigned satd_costs[PARALLEL_BLKS] = { 0 }; - satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + if (satd_twin_func != NULL) { + satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + } else { + satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width); + satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width); + } unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 }; - sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + if (sad_twin_func != NULL) { + sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + } else { + 
unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width); + unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width); + } costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2); costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2); @@ -189,27 +204,29 @@ static void get_cost_dual(encoder_state_t * const state, * \param lcu_px Position of the top left pixel of current CU within current LCU. */ static void derive_mts_constraints(cu_info_t *const pred_cu, - lcu_t *const lcu, const int depth, + lcu_t *const lcu, const int width, const int height, const vector2d_t lcu_px) { - const int width = LCU_WIDTH >> depth; - int8_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; int32_t i; // ToDo: large block support in VVC? uint32_t sig_coeffgroup_flag[32 * 32] = { 0 }; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] - + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_idx]; - const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); + + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, lcu->coeff.y, lcu_px.x, lcu_px.y, width, height, LCU_WIDTH); signed scan_cg_last = -1; signed scan_pos_last = -1; - for (int i = 0; i < width * width; i++) { - if (coeff[scan[i]]) { + for (int i = 0; i < width * height; i++) { + if (coeff_y[scan[i]]) { scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } @@ -247,6 +264,7 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, } + /** * \brief Perform search for best intra transform split configuration. * @@ -262,51 +280,42 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, */ static double search_intra_trdepth( encoder_state_t * const state, - int x_px, - int y_px, - int depth, - int max_depth, + const cu_loc_t* const cu_loc, double cost_treshold, intra_search_data_t *const search_data, lcu_t *const lcu, enum uvg_tree_type tree_type) { - assert(depth >= 0 && depth <= MAX_PU_DEPTH); - - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - const int width_c = width > TR_MIN_WIDTH ? 
width / 2 : width; - - const int offset = width / 2; - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; // TODO: height for non-square blocks + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; + + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; const bool reconstruct_chroma = false;// (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400; cu_info_t* pred_cu = &search_data->pred_cu; - cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - - struct { - uvg_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH]; - } nosplit_pixels; - uint16_t nosplit_cbf = 0; double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; - if (depth > 0) { - tr_cu->tr_depth = depth; - pred_cu->tr_depth = depth; + cabac_data_t cabac_data; + memcpy(&cabac_data, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + + if (width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH) { const bool mts_enabled = (state->encoder_control->cfg.mts == UVG_MTS_INTRA || state->encoder_control->cfg.mts == UVG_MTS_BOTH) - && tr_cu->depth == tr_cu->tr_depth; + && PU_IS_TU(pred_cu); nosplit_cost = 0.0; + const bool has_been_split = 1 << pred_cu->log2_width != cu_loc->width || + 1 << pred_cu->log2_height != cu_loc->height; - cbf_clear(&pred_cu->cbf, depth, COLOR_Y); + cbf_clear(&pred_cu->cbf, COLOR_Y); if (reconstruct_chroma) { - cbf_clear(&pred_cu->cbf, depth, COLOR_U); - cbf_clear(&pred_cu->cbf, depth, COLOR_V); + cbf_clear(&pred_cu->cbf, COLOR_U); + cbf_clear(&pred_cu->cbf, COLOR_V); } const int8_t chroma_mode = reconstruct_chroma ? (!pred_cu->intra.mip_flag ? pred_cu->intra.mode : 0) : -1; @@ -325,51 +334,53 @@ static double search_intra_trdepth( { trafo = 0; num_transforms = (mts_enabled ? MTS_TR_NUM : 1); + // Do not do MTS search if ISP mode is used + num_transforms = pred_cu->intra.isp_mode == ISP_MODE_NO_ISP ? num_transforms : 1; } const int mts_start = trafo; - //TODO: height - if (state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { + if (state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && PU_IS_TU(pred_cu) + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; - pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params - int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? - 0 : - 2; + int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? 0 : 2; if(pred_cu->intra.mip_flag && (width < 16 || height < 16)) { max_lfnst_idx = 0; } - + int start_idx = 0; - int end_idx = state->encoder_control->cfg.lfnst && depth == pred_cu-> - tr_depth ? - max_lfnst_idx : - 0; - for (int i = start_idx; i < end_idx + 1; ++i) { + int end_lfnst_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && + uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
max_lfnst_idx : 0; + for (int i = start_idx; i < end_lfnst_idx + 1; ++i) { search_data->lfnst_costs[i] = MAX_DOUBLE; } - - for (int lfnst_idx = start_idx; lfnst_idx <= end_idx; lfnst_idx++) { - // Initialize lfnst variables - pred_cu->lfnst_idx = lfnst_idx; - pred_cu->violates_lfnst_constrained_luma = false; - pred_cu->violates_lfnst_constrained_chroma = false; - pred_cu->lfnst_last_scan_pos = false; - - for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { + // Initialize lfnst variables + search_data->best_isp_cbfs = 0; pred_cu->tr_idx = trafo; pred_cu->tr_skip = trafo == MTS_SKIP; - bool constraints[2] = { false, false}; + pred_cu->lfnst_idx = lfnst_idx; + pred_cu->violates_lfnst_constrained_luma = false; + pred_cu->violates_lfnst_constrained_chroma = false; + pred_cu->lfnst_last_scan_pos = false; + + bool constraints[2] = {false, false}; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; pred_cu->violates_mts_coeff_constraint = 0; - if ((trafo == MTS_SKIP && width > (1 << state->encoder_control->cfg.trskip_max_size)) - || !state->encoder_control->cfg.trskip_enable) { + if (trafo == MTS_SKIP && ((width > (1 << state->encoder_control->cfg.trskip_max_size) + || (height > (1 << state->encoder_control->cfg.trskip_max_size))) + || !PU_IS_TU(pred_cu) + || !state->encoder_control->cfg.trskip_enable)) { continue; } } @@ -377,65 +388,80 @@ static double search_intra_trdepth( if (pred_cu->lfnst_idx > 0 && pred_cu->tr_idx > 0) { continue; } - - uvg_intra_recon_cu( - state, - x_px, - y_px, - depth, - search_data, - pred_cu, - lcu, - UVG_LUMA_T, - true, - false); - if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; + if (!has_been_split && (lfnst_idx != 0 || trafo != 0)) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); + state->search_cabac.update = 1; + } + double rd_cost; + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + rd_cost = uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + cost_treshold, + search_data, + lcu, + &constraints[0] + ); + constraints[1] = search_data->best_isp_cbfs != 0; + } + else { + uvg_intra_recon_cu( + state, + search_data, + cu_loc, + pred_cu, + lcu, + UVG_LUMA_T, + true, + false + ); + } + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; + + if ((trafo != 0 || lfnst_idx != 0) && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; - derive_mts_constraints(pred_cu, lcu, depth, lcu_px); + derive_mts_constraints(pred_cu, lcu, width, height, lcu_px); if (pred_cu->tr_idx > 1) { if (pred_cu->violates_mts_coeff_constraint || !pred_cu-> mts_last_scan_pos) { continue; } } - - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH, - lcu_px.x, - lcu_px.y); - - if (trafo != MTS_SKIP && end_idx != 0) { + + if (trafo != MTS_SKIP && end_lfnst_idx != 0 && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.y[scan_offset], + lcu->coeff.y, width, - height - ); + height, + &lcu_px, + COLOR_Y); } - if (!constraints[1] && cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) { + if (!constraints[1] && cbf_is_set(pred_cu->cbf, COLOR_Y)) { //end_idx = 0; if (pred_cu->lfnst_idx > 0) { continue; } } - double rd_cost = uvg_cu_rd_cost_luma( - state, - lcu_px.x, - lcu_px.y, - depth, - pred_cu, - lcu); + + + if (pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + rd_cost = uvg_cu_rd_cost_luma( + state, + cu_loc, + pred_cu, + 
lcu, + search_data->best_isp_cbfs); + } double transform_bits = 0; - if (state->encoder_control->cfg.lfnst && depth == pred_cu->tr_depth && - trafo != MTS_SKIP) { - if (!constraints[0] && constraints[1]) { + if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && + trafo != MTS_SKIP && end_lfnst_idx != 0 && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0)) { + if ((!constraints[0] && (constraints[1] || pred_cu->intra.isp_mode != ISP_MODE_NO_ISP))) { transform_bits += CTX_ENTROPY_FBITS( - &state->search_cabac.ctx.lfnst_idx_model[tr_cu->depth == 4 || - tree_type == UVG_LUMA_T], + &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], lfnst_idx != 0); if (lfnst_idx > 0) { transform_bits += CTX_ENTROPY_FBITS( @@ -444,10 +470,14 @@ static double search_intra_trdepth( } } } - if (num_transforms > 2 && trafo != MTS_SKIP && width <= 32 - /*&& height <= 32*/ + if (num_transforms > 2 && trafo != MTS_SKIP + && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0) + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP + && lfnst_idx == 0 + && width <= 32 + && height <= 32 && !pred_cu->violates_mts_coeff_constraint && pred_cu-> - mts_last_scan_pos && lfnst_idx == 0) { + mts_last_scan_pos) { bool symbol = trafo != 0; int ctx_idx = 0; @@ -464,7 +494,7 @@ static double search_intra_trdepth( } } - rd_cost += transform_bits * state->frame->lambda; + rd_cost += transform_bits * state->lambda; search_data->lfnst_costs[lfnst_idx] = MIN( search_data->lfnst_costs[lfnst_idx], @@ -480,30 +510,22 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH_C, - lcu_px.x, - lcu_px.y); uvg_intra_recon_cu( state, - x_px, - y_px, - depth, search_data, + cu_loc, pred_cu, lcu, UVG_BOTH_T, false, - true); + true + ); best_rd_cost += uvg_cu_rd_cost_chroma( state, - lcu_px.x, - lcu_px.y, - depth, pred_cu, - lcu); + lcu, + cu_loc); pred_cu->intra.mode = luma_mode; // Check lfnst constraints for chroma @@ -513,24 +535,24 @@ static double search_intra_trdepth( pred_cu->lfnst_last_scan_pos}; uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + height_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; } uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + height_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; @@ -542,14 +564,13 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, - x_px, y_px, - depth, search_data, - pred_cu, - lcu, - UVG_BOTH_T,false,true); - best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + search_data, cu_loc, + pred_cu, lcu, + UVG_BOTH_T, + false, + true); + best_rd_cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); pred_cu->intra.mode = luma_mode; } pred_cu->tr_skip = best_tr_idx == MTS_SKIP; @@ -562,35 +583,10 @@ static double 
search_intra_trdepth( // Early stop condition for the recursive search. // If the cost of any 1/4th of the transform is already larger than the // whole transform, assume that splitting further is a bad idea. - if (nosplit_cost >= cost_treshold) { + if (nosplit_cost <= cost_treshold) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); return nosplit_cost; } - - nosplit_cbf = pred_cu->cbf; - - uvg_pixels_blit( - lcu->rec.y, - nosplit_pixels.y, - width, - width, - LCU_WIDTH, - width); - if (reconstruct_chroma) { - uvg_pixels_blit( - lcu->rec.u, - nosplit_pixels.u, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - uvg_pixels_blit( - lcu->rec.v, - nosplit_pixels.v, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - } } @@ -599,63 +595,32 @@ static double search_intra_trdepth( // - Maximum transform hierarchy depth is constrained by clipping // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). - if (depth < max_depth && depth < MAX_PU_DEPTH) { + else { split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; } - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + else { + split = BT_HOR_SPLIT; } - double cbf_bits = 0.0; - - // Add cost of cbf chroma bits on transform tree. - // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true - // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 - // if this and any previous transform block has no chroma coefficients. - // When searching the first block we don't actually know the real values, - // so this will code cbf as 0 and not code the cbf at all for descendants. 
- if (state->encoder_control->chroma_format != UVG_CSP_400) { - const uint8_t tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); - } - ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); - } + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + split_cost += search_intra_trdepth(state, &split_cu_loc[i], nosplit_cost, search_data, lcu, tree_type); } - - double bits = cbf_bits; - split_cost += bits * state->lambda; - } else { - assert(width <= TR_MAX_WIDTH); } + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); - if (depth == 0 || split_cost < nosplit_cost) { + if (!PU_IS_TU(pred_cu) || split_cost < nosplit_cost) { return split_cost; } else { - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); - - pred_cu->cbf = nosplit_cbf; - - // We only restore the pixel data and not coefficients or cbf data. - // The only thing we really need are the border pixels.uvg_intra_get_dir_luma_predictor - uvg_pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); - if (reconstruct_chroma) { - uvg_pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); - uvg_pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C); - } - return nosplit_cost; } } @@ -679,25 +644,31 @@ static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) static int search_intra_chroma_rough( encoder_state_t * const state, - int x_px, - int y_px, - int depth, - const uvg_pixel *orig_u, - const uvg_pixel *orig_v, - int16_t origstride, - uvg_intra_references *refs_u, - uvg_intra_references *refs_v, intra_search_data_t* chroma_data, lcu_t* lcu, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - assert(depth != 4 || (x_px & 4 && y_px & 4)); + const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width]; + const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; + const int width = 1 << log2_width_c; + const int height = width; // TODO: height for non-square blocks - const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; + + uvg_intra_references refs_u; + uvg_intra_build_reference(state, &loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0, 0); + + uvg_intra_references refs_v; + uvg_intra_build_reference(state, &loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); + + vector2d_t lcu_cpx = { (cu_loc->local_x) / 2, (cu_loc->local_y) / 2 }; + uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; //cost_pixel_nxn_func *const sad_func = uvg_pixels_get_sad_func(width); - cu_loc_t loc = { x_px & ~7, y_px & ~7, width, width, width, width }; uvg_pixel _pred[32 * 
32 + SIMD_ALIGNMENT]; uvg_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); @@ -705,12 +676,12 @@ static int search_intra_chroma_rough( uvg_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig_u, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_u, orig_block, width, height, LCU_WIDTH_C, width); int modes_count = (state->encoder_control->cfg.cclm ? 8 : 5); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_u, cu_loc, &loc, COLOR_U, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -725,11 +696,11 @@ static int search_intra_chroma_rough( } } - uvg_pixels_blit(orig_v, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_v, orig_block, width, height, LCU_WIDTH_C, width); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_v, cu_loc, &loc, COLOR_V, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -818,7 +789,7 @@ static int16_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -837,7 +808,7 @@ static int16_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. - cu_loc_t loc = { 0, 0, width, width, width, width }; + cu_loc_t loc = { 0, 0, width, height, width, height }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; @@ -984,8 +955,9 @@ static INLINE double count_bits( const double not_mpm_mode_bit, const double planar_mode_flag, const double not_planar_mode_flag, + const double not_isp_flag, int8_t mode - ) +) { int i = 0; int smaller_than_pred = 0; @@ -1007,7 +979,7 @@ static INLINE double count_bits( else { bits = not_mpm_mode_bit + 5 + (mode - smaller_than_pred > 2); } - bits += not_mrl + not_mip; + bits += not_mrl + not_mip + not_isp_flag; return bits; } @@ -1017,19 +989,19 @@ static uint8_t search_intra_rough( uvg_pixel *orig, int32_t origstride, uvg_intra_references *refs, - int log2_width, + int width, + int height, int8_t *intra_preds, intra_search_data_t* modes_out, cu_info_t* const pred_cu, uint8_t mip_ctx) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? 
- assert(log2_width >= 2 && log2_width <= 5); - int_fast8_t width = 1 << log2_width; + assert(width >= 4 && width <= 32); // cost_pixel_nxn_func *satd_func = kvz_pixels_get_satd_func(width); // cost_pixel_nxn_func *sad_func = kvz_pixels_get_sad_func(width); - cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width); - cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width); + cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); bool mode_checked[UVG_NUM_INTRA_MODES] = {0}; double costs[UVG_NUM_INTRA_MODES]; @@ -1044,7 +1016,7 @@ static uint8_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -1055,13 +1027,14 @@ static uint8_t search_intra_rough( int8_t mode; double cost; }; - + const double not_mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0; const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0); const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); + const double not_isp_flag = state->encoder_control->cfg.isp && uvg_can_use_isp(width, height) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_subpart_model[0]), 0) : 0; const uint8_t mode_list_size = state->encoder_control->cfg.mip ? 6 : 3; struct mode_cost best_six_modes[6]; @@ -1070,17 +1043,16 @@ static uint8_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. 
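The comment above refers to the coarse-to-fine scan driven by intra_rough_search_levels: angular modes are first costed at a stride of 2^levels, after which the stride is halved around the best candidate found so far. A self-contained sketch of that control flow under those assumptions (the cost callback is a placeholder; the real code batches PARALLEL_BLKS predictions per call and keeps the six best modes rather than a single winner):

#include <stdint.h>

static int8_t rough_angular_scan(double (*cost_fn)(int8_t mode), int levels)
{
  int offset = 1 << levels;
  int8_t best = 2;
  double best_cost = cost_fn(2);

  /* Pass 1: cost every offset-th angular mode (modes 2..66). */
  for (int m = 2 + offset; m <= 66; m += offset) {
    double c = cost_fn((int8_t)m);
    if (c < best_cost) { best_cost = c; best = (int8_t)m; }
  }

  /* Pass 2: halve the stride and probe both neighbours of the best. */
  for (offset >>= 1; offset > 0; offset >>= 1) {
    for (int sign = -1; sign <= 1; sign += 2) {
      int m = best + sign * offset;
      if (m < 2 || m > 66) continue;
      double c = cost_fn((int8_t)m);
      if (c < best_cost) { best_cost = c; best = (int8_t)m; }
    }
  }
  return best;
}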
- cu_loc_t loc = { 0, 0, width, width, width, width }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL); search_proxy.pred_cu.intra.mode = 1; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs); mode_checked[0] = true; mode_checked[1] = true; costs[0] += count_bits( @@ -1092,7 +1064,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 0) * state->lambda_sqrt; + not_isp_flag, 0) * state->lambda_sqrt; costs[1] += count_bits( state, intra_preds, @@ -1102,7 +1074,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 1) * state->lambda_sqrt; + not_isp_flag, 1) * state->lambda_sqrt; if(costs[0] < costs[1]) { min_cost = costs[0]; max_cost = costs[1]; @@ -1129,12 +1101,12 @@ static uint8_t search_intra_rough( for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { search_proxy.pred_cu.intra.mode = mode + i*offset; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL); } } //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { costs_out[i] += count_bits( @@ -1146,7 +1118,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - mode + i * offset) * state->lambda_sqrt; + not_isp_flag, mode + i * offset) * state->lambda_sqrt; } } @@ -1201,12 +1173,12 @@ static uint8_t search_intra_rough( for (int block = 0; block < PARALLEL_BLKS; ++block) { search_proxy.pred_cu.intra.mode = modes_to_check[block + i]; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL); } //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int block = 0; block < PARALLEL_BLKS; ++block) { costs_out[block] += count_bits( state, @@ -1217,7 +1189,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - modes_to_check[block + i]) * state->lambda_sqrt; + not_isp_flag, modes_to_check[block + i]) * state->lambda_sqrt; } @@ -1270,8 +1242,12 @@ static void get_rough_cost_for_2n_modes( #define PARALLEL_BLKS 2 assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes"); const int width = cu_loc->width; - cost_pixel_nxn_multi_func* satd_dual_func = uvg_pixels_get_satd_dual_func(width); - 
cost_pixel_nxn_multi_func* sad_dual_func = uvg_pixels_get_sad_dual_func(width); + const int height = cu_loc->height; + cost_pixel_nxn_multi_func* satd_dual_func; + cost_pixel_nxn_multi_func* sad_dual_func; + satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); + uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); @@ -1279,7 +1255,7 @@ static void get_rough_cost_for_2n_modes( uvg_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT]; uvg_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig, orig_block, width, width, orig_stride, width); + uvg_pixels_blit(orig, orig_block, width, height, orig_stride, width); const double mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0; const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; @@ -1288,9 +1264,9 @@ static void get_rough_cost_for_2n_modes( double bits[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { - uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); + uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); } - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for(int i = 0; i < PARALLEL_BLKS; ++i) { uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx; @@ -1345,28 +1321,58 @@ static void get_rough_cost_for_2n_modes( */ static int8_t search_intra_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int modes_to_check, intra_search_data_t *search_data, lcu_t *lcu, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { - double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); - search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; - search_data[mode].bits = rdo_bitcost; - search_data[mode].cost = rdo_bitcost * state->lambda; + bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false : true; // Cannot use ISP with MIP + // can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL + const uint8_t mrl_idx = search_data[mode].pred_cu.intra.multi_ref_idx; + double best_isp_cost = MAX_DOUBLE; + double best_bits = MAX_DOUBLE; + int8_t best_isp_mode = 0; + int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height) && state->encoder_control->cfg.isp ? 
NUM_ISP_MODES : 1; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); - search_data[mode].cost += mode_cost; - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { - modes_to_check = mode + 1; - break; + // + uint8_t best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; + uint8_t best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; + for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { + + + search_data[mode].pred_cu.intra.isp_mode = isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = isp_mode == ISP_MODE_NO_ISP ? mrl_idx : 0; + double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, cu_loc, lcu); + search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; + search_data[mode].bits = rdo_bitcost; + search_data[mode].cost = rdo_bitcost * state->lambda; + + double mode_cost = search_intra_trdepth(state, cu_loc, MAX_INT, &search_data[mode], lcu, tree_type); + best_mts_mode_for_isp[isp_mode] = search_data[mode].pred_cu.tr_idx; + best_lfnst_mode_for_isp[isp_mode] = search_data[mode].pred_cu.lfnst_idx; + search_data[mode].cost += mode_cost; + if (search_data[mode].cost < best_isp_cost) { + best_isp_cost = search_data[mode].cost; + best_isp_mode = isp_mode; + best_bits = search_data[mode].bits; + } + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf)) { + modes_to_check = mode + 1; + break; + } } + search_data[mode].cost = best_isp_cost; + search_data[mode].bits = best_bits; + search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = best_isp_mode == ISP_MODE_NO_ISP ? mrl_idx : 0; + search_data[mode].pred_cu.tr_idx = best_mts_mode_for_isp[best_isp_mode]; + search_data[mode].pred_cu.tr_skip = best_mts_mode_for_isp[best_isp_mode] == MTS_SKIP; + search_data[mode].pred_cu.lfnst_idx = best_lfnst_mode_for_isp[best_isp_mode]; } // Update order according to new costs @@ -1384,7 +1390,9 @@ static int8_t search_intra_rdo( } -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, + const lcu_t* lcu) { cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; @@ -1393,8 +1401,8 @@ double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const c uvg_encode_intra_luma_coding_unit( state, &cabac_copy, cur_cu, - x, y, depth, lcu, &mode_bits - ); + cu_loc, lcu, &mode_bits + ); return mode_bits; } @@ -1436,20 +1444,20 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in int8_t uvg_search_intra_chroma_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int8_t num_modes, lcu_t *const lcu, + const cu_loc_t* const cu_loc, intra_search_data_t* chroma_data, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + bool is_separate) { - const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); - - + const bool reconstruct_chroma = true; + + const int chroma_width = cu_loc->chroma_width; + const int chroma_height = cu_loc->chroma_height; uvg_intra_references refs[2]; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y }; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height, @@ -1457,29 
+1465,21 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - uvg_intra_build_reference(log2_width, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); - uvg_intra_build_reference(log2_width, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); - int8_t width = 1 << log2_width; - int8_t height = 1 << log2_width; - const cu_loc_t loc = { x_px &~7, y_px & ~7, width, height, width, height}; - const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C; + + const int offset = ((cu_loc->local_x) >> 1) + ((cu_loc->local_y) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { + if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && PU_IS_TU(&chroma_data->pred_cu) && chroma_height >= 4 && chroma_width >= 4) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } } - else if(chroma_data->pred_cu.lfnst_idx) { - lfnst_modes_to_check[0] = chroma_data->pred_cu.lfnst_idx; - lfnst_modes_to_check[1] = -1; - lfnst_modes_to_check[2] = -1; - } else { lfnst_modes_to_check[0] = 0; lfnst_modes_to_check[1] = -1; @@ -1490,11 +1490,15 @@ int8_t uvg_search_intra_chroma_rdo( ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; + double original_c_lambda = state->c_lambda; + state->quant_blocks[2].needs_init = true; + state->rate_estimator[1].needs_init = true; for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) { const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma; double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode); - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; + chroma_data[mode_i].bits = mode_bits; cu_info_t* pred_cu = &chroma_data[mode_i].pred_cu; uint8_t best_lfnst_index = 0; for (int lfnst_i = 0; lfnst_i < 3; ++lfnst_i) { @@ -1502,58 +1506,58 @@ int8_t uvg_search_intra_chroma_rdo( if (lfnst == -1) { continue; } + state->c_lambda = original_c_lambda * (state->encoder_control->cfg.jccr && state->qp > 18 ? 
1.3 : 1.0); pred_cu->cr_lfnst_idx = lfnst; - chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->lambda; - if (pred_cu->tr_depth == pred_cu->depth) { + chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->c_lambda; + if (PU_IS_TU(pred_cu) && (tree_type != UVG_CHROMA_T || (pred_cu->log2_chroma_width < 5 && pred_cu->log2_chroma_height < 5))) { uvg_intra_predict( state, &refs[COLOR_U - 1], - &loc, + cu_loc, + cu_loc, COLOR_U, u_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_intra_predict( state, &refs[COLOR_V - 1], - &loc, + cu_loc, + cu_loc, COLOR_V, v_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_generate_residual( &lcu->ref.u[offset], u_pred, u_resi, - width, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_generate_residual( &lcu->ref.v[offset], v_pred, v_resi, - width, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &temp_cabac, - width, - height, + cu_loc, offset, - mode, pred_cu, u_pred, v_pred, u_resi, v_resi, &chorma_ts_out, - tree_type); + is_separate ? UVG_CHROMA_T : tree_type); // LFNST constraint failed if(chorma_ts_out.best_u_index == -1 && chorma_ts_out.best_combined_index == -1) { @@ -1561,8 +1565,9 @@ int8_t uvg_search_intra_chroma_rdo( continue; } + double actual_cost = state->lambda * (chorma_ts_out.u_bits + chorma_ts_out.v_bits + mode_bits) + (chorma_ts_out.u_distortion + chorma_ts_out.v_distortion); if(chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if( chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = 0; @@ -1574,7 +1579,7 @@ int8_t uvg_search_intra_chroma_rdo( } } else { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_combined_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if (chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = chorma_ts_out.best_combined_index; @@ -1583,17 +1588,18 @@ int8_t uvg_search_intra_chroma_rdo( chroma_data[mode_i].cost = chroma_data[mode_i].lfnst_costs[lfnst]; } } + } else { state->search_cabac.update = 1; - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; uvg_intra_recon_cu(state, - x_px, y_px, - depth, &chroma_data[mode_i], - pred_cu, - lcu, - tree_type, false, true); - chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + &chroma_data[mode_i], cu_loc, + pred_cu, lcu, + tree_type, + false, + true); + chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); } } @@ -1602,6 +1608,7 @@ int8_t uvg_search_intra_chroma_rdo( } sort_modes(chroma_data, num_modes); + state->c_lambda = original_c_lambda; return chroma_data[0].pred_cu.intra.mode_chroma; } @@ -1612,26 +1619,25 @@ int8_t uvg_search_intra_chroma_rdo( int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, - enum uvg_tree_type tree_type) + int8_t 
luma_mode, + enum uvg_tree_type tree_type, + bool is_separate) { - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const cu_info_t *cur_pu = &search_data->pred_cu; - int8_t intra_mode = !cur_pu->intra.mip_flag ? cur_pu->intra.mode : 0; - int8_t modes[8] = { 0, 50, 18, 1, intra_mode, 81, 82, 83 }; - uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); + int8_t modes[8] = { 0, 50, 18, 1, luma_mode, 81, 82, 83 }; + uint8_t total_modes = (state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_pu, tree_type) ? 8 : 5); for(int i = 0; i < 4; i++) { - if (modes[i] == intra_mode) { + if (modes[i] == luma_mode) { modes[i] = 66; break; } } + // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made @@ -1648,9 +1654,9 @@ int8_t uvg_search_cu_intra_chroma( FILL(chroma_data, 0); for (int i = 0; i < num_modes; i++) { chroma_data[i].pred_cu = *cur_pu; - chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? intra_mode : modes[i]; + chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? luma_mode : modes[i]; chroma_data[i].cost = 0; - if(depth != 4 && tree_type == UVG_BOTH_T) { + if(!is_separate && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); } } @@ -1659,34 +1665,15 @@ int8_t uvg_search_cu_intra_chroma( // num_modes is 0. if(state->encoder_control->cfg.cclm && 0){ - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7}; + - uvg_intra_references refs_u; - uvg_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); - - uvg_intra_references refs_v; - uvg_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); - - vector2d_t lcu_cpx = { (lcu_px.x & ~7) / 2, (lcu_px.y & ~7) / 2 }; - uvg_pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - uvg_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - - num_modes = search_intra_chroma_rough(state, x_px, y_px, depth, - ref_u, - ref_v, - LCU_WIDTH_C, - &refs_u, - &refs_v, - chroma_data, - lcu, - intra_mode, - tree_type); + num_modes = search_intra_chroma_rough(state, chroma_data, lcu, luma_mode, + tree_type, + cu_loc); } if (num_modes > 1 || state->encoder_control->cfg.jccr) { - uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode, tree_type); + uvg_search_intra_chroma_rdo(state, num_modes, lcu, cu_loc, chroma_data, luma_mode, tree_type, is_separate); } else if(cur_pu->lfnst_idx) { chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx; @@ -1782,19 +1769,15 @@ static int select_candidates_for_further_search(const encoder_state_t * const st */ void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, - enum uvg_tree_type tree_type) + int8_t
y_px }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; + const int8_t log2_width = uvg_g_convert_to_log2[cu_loc->width]; + const int8_t log2_height = uvg_g_convert_to_log2[cu_loc->height]; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); @@ -1810,25 +1793,22 @@ void uvg_search_cu_intra( // Select left and top CUs if they are available. // Top CU is not available across LCU boundary. - if (x_px >= SCU_WIDTH) { - left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_width-1); + if (cu_loc->x >= SCU_WIDTH) { + left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_loc->height-1); } - if (y_px >= SCU_WIDTH && lcu_px.y > 0) { - above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_width-1, lcu_px.y - 1); + if (cu_loc->y >= SCU_WIDTH && lcu_px.y > 0) { + above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_loc->width-1, lcu_px.y - 1); } - int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); + int8_t num_cand = uvg_intra_get_dir_luma_predictor(cu_loc->x, cu_loc->y, candidate_modes, cur_cu, left_cu, above_cu); - if (depth > 0) { - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); + bool is_large = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; + if (!is_large) { + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); } - - // The maximum number of possible MIP modes depend on block size & shape - int width = LCU_WIDTH >> depth; - int height = width; // TODO: proper height for non-square blocks. - + // This is needed for bit cost calculation and requires too many parameters to be // calculated inside the rough search functions - uint8_t mip_ctx = uvg_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL); + uint8_t mip_ctx = uvg_get_mip_flag_context(cu_loc, lcu, NULL); // Find best intra mode for 2Nx2N. uvg_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; @@ -1839,24 +1819,25 @@ void uvg_search_cu_intra( temp_pred_cu.type = CU_INTRA; FILL(temp_pred_cu.intra, 0); // Find modes with multiple reference lines if in use. Do not use if CU in first row. - uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; + uint8_t lines = state->encoder_control->cfg.mrl && lcu_px.y != 0 ?
MAX_REF_LINE_IDX : 1; uint8_t number_of_modes; uint8_t num_regular_modes; - bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); + bool skip_rough_search = (is_large || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { num_regular_modes = number_of_modes = search_intra_rough( - state, - &cu_loc, - ref_pixels, - LCU_WIDTH, - refs, - log2_width, - candidate_modes, - search_data, - &temp_pred_cu, - mip_ctx); - // if(lines == 1) sort_modes(search_data, number_of_modes); + state, + cu_loc, + ref_pixels, + LCU_WIDTH, + refs, + cu_loc->width, + cu_loc->height, + candidate_modes, + search_data, + &temp_pred_cu, + mip_ctx); + // if(lines == 1) sort_modes(search_data, number_of_modes); } else { for (int8_t i = 0; i < UVG_NUM_INTRA_MODES; i++) { @@ -1870,7 +1851,7 @@ void uvg_search_cu_intra( } uint8_t num_mrl_modes = 0; - for(int line = 1; line < lines; ++line) { + for(int line = 1; line < lines && !is_large; ++line) { uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) { @@ -1878,7 +1859,7 @@ void uvg_search_cu_intra( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; + int height = (cu_loc->height) * 2 + MAX_REF_LINE_IDX; height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], @@ -1887,7 +1868,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; @@ -1899,7 +1880,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search && lines != 1) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes, mip_ctx); @@ -1912,11 +1893,11 @@ void uvg_search_cu_intra( int num_mip_modes = 0; if (state->encoder_control->cfg.mip) { // MIP is not allowed for 64 x 4 or 4 x 64 blocks - if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { - num_mip_modes = NUM_MIP_MODES_FULL(width, height); + if (!((cu_loc->height == 64 && cu_loc->width== 4) || (cu_loc->height== 4 && cu_loc->width == 64))) { + num_mip_modes = NUM_MIP_MODES_FULL(cu_loc->width, cu_loc->height); for (int transpose = 0; transpose < 2; transpose++) { - const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + const int half_mip_modes = num_mip_modes / 2; for (int i = 0; i < half_mip_modes; ++i) { const int index = i + number_of_modes + transpose * half_mip_modes; search_data[index].pred_cu = temp_pred_cu; @@ -1928,7 +1909,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes, mip_ctx); @@ -1937,9 +1918,6 @@ void uvg_search_cu_intra( 
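On the MIP hunk just above: NUM_MIP_MODES_FULL() and the transpose halving follow VVC's three matrix-intra-prediction size classes, where every matrix can also be applied transposed. A sketch of the count rule as assumed here (the real macro is defined elsewhere in the tree; this helper is illustrative only):

/* VVC MIP size classes and mode counts, transpose included:
 *   4x4             -> 16 matrices -> 32 modes
 *   4x8, 8x4, 8x8   ->  8 matrices -> 16 modes
 *   all other sizes ->  6 matrices -> 12 modes
 * 64x4 and 4x64 are excluded outright, as the code above checks. */
static int mip_modes_full(int width, int height)
{
  if (width == 4 && height == 4) return 32;
  if (width == 4 || height == 4 || (width == 8 && height == 8)) return 16;
  return 12;
}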
number_of_modes += num_mip_modes; } - - // Set transform depth to current depth, meaning no transform splits. - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); // Refine results with slower search or get some results if rough search was skipped. const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { @@ -1956,7 +1934,7 @@ void uvg_search_cu_intra( {2, 3, 3, 3, 3, 2}, // 64x4, 64x8, 64x16, 64x32, 64x64, 64x128, {2, 2, 2, 2, 2, 3}, // 128x4, 128x8, 128x16, 128x32, 128x64, 128x128, }; - number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[7- depth - 3][7 - depth - 3]; + number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[log2_width - 2][log2_height - 2]; } else { // Check only the predicted modes. number_of_modes_to_search = 0; @@ -1968,8 +1946,8 @@ void uvg_search_cu_intra( search_data, num_regular_modes, num_mip_modes, - width, - height + cu_loc->width, + cu_loc->height ); } } @@ -1991,16 +1969,16 @@ void uvg_search_cu_intra( number_of_modes_to_search++; } } - + + state->quant_blocks[0].needs_init = 1; + state->rate_estimator[0].needs_init = 1; search_intra_rdo( state, - x_px, - y_px, - depth, number_of_modes_to_search, search_data, lcu, - tree_type); + tree_type, + cu_loc); search_data[0].pred_cu.mts_last_scan_pos = false; search_data[0].pred_cu.violates_mts_coeff_constraint = false; } diff --git a/src/search_intra.h b/src/search_intra.h index 36470e63..ebcec26e 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -43,27 +43,27 @@ #include "global.h" // IWYU pragma: keep #include "intra.h" -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu); +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, + const lcu_t* lcu); double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode); int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t* best_cclm, - enum uvg_tree_type tree_type); + int8_t luma_mode, + enum uvg_tree_type tree_type, + bool is_separate); void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, intra_search_data_t* search_data, lcu_t *lcu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc); #endif // SEARCH_INTRA_H_ diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index b695273b..081b1b25 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -52,10 +52,17 @@ extern const int16_t uvg_g_dct_8_t[8][8]; extern const int16_t uvg_g_dct_16_t[16][16]; extern const int16_t uvg_g_dct_32_t[32][32]; -#if COMPILE_INTEL_AVX2 +#define COMPILE_INTEL_AVX2 1 + +#if COMPILE_INTEL_AVX2 #include "uvg266.h" #if UVG_BIT_DEPTH == 8 #include +#include "strategies/avx2/dct_avx2_tables.h" +#define MAX_LOG2_TR_DYNAMIC_RANGE 15 +#define TRANSFORM_MATRIX_SHIFT 6 +#define INVERSE_SHIFT_1ST (TRANSFORM_MATRIX_SHIFT + 1) +#define INVERSE_SHIFT_2ND (TRANSFORM_MATRIX_SHIFT + MAX_LOG2_TR_DYNAMIC_RANGE - 1 - UVG_BIT_DEPTH) /* * \file @@ -73,6 +80,583 @@ static INLINE __m256i truncate_avx2(__m256i v, __m256i debias, int32_t shift) return _mm256_srai_epi32(truncable, shift); } + +// TODO: find avx2 solution for transpose +// TODO: attempt to make a generic transpose for avx2. 
Needs some extra logic for different widths and heights. +// TODO: make a few solutions for exact sizes and see if some pattern emerges... +static void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) { + const int sample_num = width * height; + const int vectors = sample_num / 16; + + int16_t* d_ptr = dst; + if (vectors == 0) { + return; + } + else if (vectors == 1) { + + } + else { + // Reserve enough storage for max transform size 32x32 + __m256i v_16b_result[64]; + __m256i v_32b_result[64]; + __m256i v_64b_result[64]; + __m256i v_128b_result[64]; + + // Handle two source vectors at a time + for (int i = 0; i < vectors; i += 2) { + __m256i v_src_0 = _mm256_load_si256((const __m256i*)src); + __m256i v_src_1 = _mm256_load_si256((const __m256i*)(src + 16)); + + v_16b_result[i] = _mm256_unpacklo_epi16(v_src_0, v_src_1); + v_16b_result[i + 1] = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + src += 32; + } + + // 32 bit shuffle pass + int loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 2; + + v_32b_result[i] = _mm256_unpacklo_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + v_32b_result[i + 1] = _mm256_unpackhi_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + loop_idx++; + } + + // 64 bit shuffle pass + loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 4; + + v_64b_result[i] = _mm256_unpacklo_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); + v_64b_result[i + 1] = _mm256_unpackhi_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); + loop_idx++; + } + + // Final 128 bit shuffle pass + for (int i = 0; i < vectors; i += 2) { + const int idx_a = 0; + const int idx_b = 0; + + v_128b_result[i] = _mm256_unpacklo_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + v_128b_result[i + 1] = _mm256_unpackhi_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + } + + // Store loop + for (int i = 0; i < vectors; ++i) { + _mm256_store_si256((__m256i*)dst, v_128b_result[i]); + dst += 16; + } + } +} + +static void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height) +{ + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dst[x * height + y] = src[y * width + x]; + } + } +} + + +typedef void (transpose_func)(const __m256i* src, __m256i* dst); + + +static void transpose_2x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x32_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[3] = 
_mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_2x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x16_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + // const __m256i v_shuffle = _mm256_set_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + // 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + + __m256i v_src_tmp[4]; + v_src_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_src_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_src_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_src_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_4x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp[8]; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_shuffle_epi8(src[i], v_shuffle); + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[i] = _mm256_shuffle_epi32(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp[6], v_tmp[7]); + + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp[6], v_tmp[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + dst[5] = 
_mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); +} +static void transpose_4x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x16_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + __m256i v_tmp128[8]; + + v_tmp128[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_tmp128[1] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_tmp128[2] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_tmp128[3] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_tmp128[4] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_tmp128[5] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + v_tmp128[6] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_tmp128[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(v_tmp128[6], v_tmp128[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(v_tmp128[6], v_tmp128[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + dst[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[1] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[3] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[4] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[5] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[6] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + dst[7] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); +} +static void transpose_8x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + + const __m256i v_shuffle = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_tmp16_lo[i] = _mm256_unpacklo_epi16(src[offset], src[offset + 1]); + v_tmp16_hi[i] = _mm256_unpackhi_epi16(src[offset], src[offset + 1]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp32_lo[i + 0] = 
_mm256_unpacklo_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_lo[i + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_lo[i + 2] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_lo[i + 3] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + + v_tmp32_hi[i + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_hi[i + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_hi[i + 2] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_hi[i + 3] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + } + + for (int i = 0; i < 8; ++i) { + v_tmp64_lo[i] = _mm256_permute4x64_epi64(v_tmp64_lo[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp64_hi[i] = _mm256_permute4x64_epi64(v_tmp64_hi[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + dst[0] = _mm256_shuffle_epi8(v_tmp64_lo[0], v_shuffle); + dst[1] = _mm256_shuffle_epi8(v_tmp64_lo[4], v_shuffle); + dst[2] = _mm256_shuffle_epi8(v_tmp64_hi[0], v_shuffle); + dst[3] = _mm256_shuffle_epi8(v_tmp64_hi[4], v_shuffle); + + dst[4] = _mm256_shuffle_epi8(v_tmp64_lo[2], v_shuffle); + dst[5] = _mm256_shuffle_epi8(v_tmp64_lo[6], v_shuffle); + dst[6] = _mm256_shuffle_epi8(v_tmp64_hi[2], v_shuffle); + dst[7] = _mm256_shuffle_epi8(v_tmp64_hi[6], v_shuffle); + + dst[8] = _mm256_shuffle_epi8(v_tmp64_lo[1], v_shuffle); + dst[9] = _mm256_shuffle_epi8(v_tmp64_lo[5], v_shuffle); + dst[10] = _mm256_shuffle_epi8(v_tmp64_hi[1], v_shuffle); + dst[11] = _mm256_shuffle_epi8(v_tmp64_hi[5], v_shuffle); + + dst[12] = _mm256_shuffle_epi8(v_tmp64_lo[3], v_shuffle); + dst[13] = _mm256_shuffle_epi8(v_tmp64_lo[7], v_shuffle); + dst[14] = _mm256_shuffle_epi8(v_tmp64_hi[3], v_shuffle); + dst[15] = _mm256_shuffle_epi8(v_tmp64_hi[7], v_shuffle); +} +static void transpose_8x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + __m256i v_tmp32_lo[2]; + __m256i v_tmp32_hi[2]; + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 
0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); +} +static void transpose_16x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[6], src[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[6], src[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); +} + +static void transpose_16x16_avx2_stride(const int16_t* src, int16_t* dst, const int src_stride, const int dst_stride) { + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 2) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 8; d += 4, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 
0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + + v_tmp64_lo[d + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + v_tmp64_hi[d + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 4 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 5 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 6 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 7 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x20)); + + _mm256_storeu_si256((__m256i*)(dst + 8 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 9 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 10 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 11 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 12 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 13 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 14 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 15 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x31)); +} + +static void transpose_16x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 16); +} + +static void transpose_16x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 16, (int16_t*)dst + 16, 16, 32); + +} +static void transpose_16x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_32x2_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo0 = _mm256_unpacklo_epi16(src[0], src[2]); + __m256i v_tmp16_lo1 = _mm256_unpacklo_epi16(src[1], src[3]); + __m256i v_tmp16_hi0 = _mm256_unpackhi_epi16(src[0], src[2]); + __m256i v_tmp16_hi1 = _mm256_unpackhi_epi16(src[1], src[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x20); + dst[3] = 
_mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x31); +} +static void transpose_32x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[2]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[1], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[6]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[5], src[7]); + + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[2]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[1], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[6]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[5], src[7]); + + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x31); + + dst[4] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[5] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x20); + dst[6] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x31); +} +static void transpose_32x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_tmp16_lo[d + 0] = _mm256_unpacklo_epi16(src[s + 0], src[s + 2]); + v_tmp16_lo[d + 1] = _mm256_unpacklo_epi16(src[s + 1], src[s + 3]); + + v_tmp16_hi[d + 0] = _mm256_unpackhi_epi16(src[s + 0], src[s + 2]); + v_tmp16_hi[d + 1] = _mm256_unpackhi_epi16(src[s + 1], src[s + 3]); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_lo[d + 4] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_lo[d + 5] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_hi[d + 4] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_hi[d + 5] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_lo[d + 4] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 5] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + + v_tmp64_hi[d + 0] = 
_mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 4] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 5] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x31); + + dst[8] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[9] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x20); + dst[10] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[11] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x20); + + dst[12] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[13] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x31); + dst[14] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); + dst[15] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x31); +} +static void transpose_32x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 16); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 16, 32, 16); +} +static void transpose_32x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 32, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32, (int16_t *)dst + 16, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32 + 16, (int16_t *)dst + 16 * 32 + 16, 32, 32); +} +static void transpose_32x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x32_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x64_avx2(const __m256i* src, __m256i* dst){} + + + +static transpose_func* transpose_func_table[6][6] = { + { transpose_2x2_avx2, transpose_4x2_avx2, transpose_8x2_avx2, transpose_16x2_avx2, transpose_32x2_avx2, transpose_64x2_avx2}, + { transpose_2x4_avx2, transpose_4x4_avx2, transpose_8x4_avx2, transpose_16x4_avx2, transpose_32x4_avx2, transpose_64x4_avx2}, + { transpose_2x8_avx2, transpose_4x8_avx2, transpose_8x8_avx2, transpose_16x8_avx2, transpose_32x8_avx2, transpose_64x8_avx2}, + {transpose_2x16_avx2, transpose_4x16_avx2, transpose_8x16_avx2, transpose_16x16_avx2, transpose_32x16_avx2, transpose_64x16_avx2}, + {transpose_2x32_avx2, transpose_4x32_avx2, transpose_8x32_avx2, transpose_16x32_avx2, transpose_32x32_avx2, transpose_64x32_avx2}, + {transpose_2x64_avx2, transpose_4x64_avx2, transpose_8x64_avx2, transpose_16x64_avx2, transpose_32x64_avx2, transpose_64x64_avx2}, +}; + + +// Dispatcher 
function for avx2 transposes. This calls the proper subfunction +static void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height) +{ + // No need to transpose something of width or height 1 + const int w_log2_minus1 = uvg_g_convert_to_log2[width] - 1; + const int h_log2_minus1 = uvg_g_convert_to_log2[height] - 1; + + transpose_func* func = transpose_func_table[h_log2_minus1][w_log2_minus1]; + func(src, dst); +} + + // 4x4 matrix multiplication with value clipping. // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. @@ -945,12 +1529,6 @@ ITRANSFORM(dct, 32); /*****************************************************/ // DST-7 -#define DEFINE_DST7_P4_MATRIX(a,b,c,d) { \ - { a, b, c, d},\ - { c, c, 0, -c},\ - { d, -a, -c, b},\ - { b, -d, c, -a},\ -} #define DEFINE_DST7_P4_MATRIX_T(a,b,c,d) { \ { a, c, d, b},\ @@ -959,17 +1537,6 @@ ITRANSFORM(dct, 32); { d, -c, b, -a},\ } -#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{\ - { a, b, c, d, e, f, g, h},\ - { c, f, h, e, b, -a, -d, -g},\ - { e, g, b, -c, -h, -d, a, f},\ - { g, c, -d, -f, a, h, b, -e},\ - { h, -a, -g, b, f, -c, -e, d},\ - { f, -e, -a, g, -d, -b, h, -c},\ - { d, -h, e, -a, -c, g, -f, b},\ - { b, -d, f, -h, g, -e, c, -a},\ -} #define DEFINE_DST7_P8_MATRIX_T(a,b,c,d,e,f,g,h) \ {\ @@ -983,25 +1550,6 @@ ITRANSFORM(dct, 32); { h, -g, f, -e, d, -c, b, -a,},\ }\ -#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o}, \ - { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n}, \ - { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m}, \ - { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l}, \ - { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k}, \ - { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j}, \ - { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i}, \ - { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h}, \ - { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g}, \ - { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f}, \ - { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e}, \ - { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d}, \ - { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c}, \ - { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b}, \ - { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a}, \ -} #define DEFINE_DST7_P16_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ { \ @@ -1024,43 +1572,6 @@ ITRANSFORM(dct, 32); } - -#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E}, \ - {e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D}, \ - {g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C}, \ - {i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B}, \ - {k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A}, \ - {m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z}, \ - {o, D, t, e, 
-j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y}, \ - {q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x}, \ - {s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w}, \ - {u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v}, \ - {w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u}, \ - {y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t}, \ - {A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s}, \ - {C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r}, \ - {E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q}, \ - {F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p}, \ - {D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o}, \ - {B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n}, \ - {z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m}, \ - {x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l}, \ - {v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k}, \ - {t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j}, \ - {r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i}, \ - {p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h}, \ - {n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g}, \ - {l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f}, \ - {j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e}, \ - {h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d}, \ - {f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c}, \ - {d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b}, \ - {b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a}, \ -} - #define DEFINE_DST7_P32_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ { \ {a, c, e, g, i, k, m, o, q, s, u, w, y, A, C, E, F, D, B, z, x, v, t, r, p, n, l, j, h, f, d, b,},\ @@ -1097,85 +1608,6 @@ ITRANSFORM(dct, 32); {F, -E, D, -C, B, -A, z, -y, x, -w, v, -u, t, -s, r, -q, p, -o, n, -m, l, -k, j, -i, h, -g, f, -e, d, -c, b, -a,},\ } -// DCT-8 -#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ -{ \ - {a, b, c, d}, \ - {b, 0, -b, -b}, \ - {c, -b, -d, a}, \ - {d, -b, a, -c}, \ -} - -#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{ \ - {a, b, c, d, e, f, g, h}, \ - {b, e, h, -g, -d, -a, -c, -f}, 
\ - {c, h, -e, -a, -f, g, b, d}, \ - {d, -g, -a, -h, c, e, -f, -b}, \ - {e, -d, -f, c, g, -b, -h, a}, \ - {f, -a, g, e, -b, h, d, -c}, \ - {g, -c, b, -f, -h, d, -a, e}, \ - {h, -f, d, -b, a, -c, e, -g}, \ -} - -#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - {b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n}, \ - {c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l}, \ - {d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j}, \ - {e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h}, \ - {f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f}, \ - {g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d}, \ - {h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b}, \ - {i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a}, \ - {j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c}, \ - {k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e}, \ - {l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g}, \ - {m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i}, \ - {n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k}, \ - {o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m}, \ - {p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o}, \ -} - - -#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D}, \ - {c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B}, \ - {d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z}, \ - {e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x}, \ - {f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v}, \ - {g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t}, \ - {h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r}, \ - {i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p}, \ - {j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n}, \ - {k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l}, \ - {l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j}, \ - {m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h}, \ - {n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f}, \ - {o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d}, \ - {p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b}, \ - {q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a}, \ - {r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c}, \ - {s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, 
-g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e}, \ - {t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g}, \ - {u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i}, \ - {v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k}, \ - {w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m}, \ - {x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o}, \ - {y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q}, \ - {z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s}, \ - {A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u}, \ - {B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w}, \ - {C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y}, \ - {D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A}, \ - {E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C}, \ - {F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E}, \ -} - - // DST-7 ALIGNED(64) const int16_t uvg_g_dst7_4[4][4] = DEFINE_DST7_P4_MATRIX(29, 55, 74, 84); ALIGNED(64) const int16_t uvg_g_dst7_8[8][8] = DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86); @@ -1309,22 +1741,7 @@ static void mts_dct_16x16_avx2(const int16_t* input, int16_t* output, tr_type_t const int skip_line = lfnst_idx ? 8 : 0; const int skip_line2 = lfnst_idx ? 8 : 0; - if (skip_line) - { - const int reduced_line = 8, cutoff = 8; - int16_t* dst2 = output + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 16; - } - } - if (skip_line2) - { - int16_t* dst2 = output + 16 * 8; - memset(dst2, 0, sizeof(int16_t) * 16 * skip_line2); - } } /**********/ @@ -1512,21 +1929,7 @@ static void mul_clip_matrix_32x32_mts_avx2(const int16_t* left, _mm256_store_si256(dst_v + dst_base + 1, h23); } - if (skip_line) - { - int16_t* dst2 = dst + reduced_line; - for (j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 32; - } - } - if (skip_line2) - { - int16_t* dst2 = dst + 32 * cutoff; - memset(dst2, 0, sizeof(int16_t) * 32 * skip_line2); - } } static void mts_dct_32x32_avx2(const int16_t* input, int16_t* output, tr_type_t type_hor, tr_type_t type_ver, uint8_t bitdepth, uint8_t lfnst_idx) @@ -1576,41 +1979,6017 @@ static tr_func* idct_table[5] = { mts_idct_4x4_avx2, mts_idct_8x8_avx2, mts_idct_16x16_avx2, mts_idct_32x32_avx2, NULL/*fastInverseDCT2_B64*/ }; +typedef void (dct_full_pass)(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver); + + +// ********************************************** +// New tailored functions for each size combination +// ********************************************** + +static void fast_forward_tr_2xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
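(1 << (shift - 1)) : 0; // ISP_TODO: drop the (shift > 0) check if shift is always greater than zero
+  // What the two madds below compute, as a hedged scalar sketch for one line j,
+  // assuming the 2-point DCT2 basis {{a, a}, {a, -a}} (a == 64 in VVC) and
+  // ignoring the packed register layout:
+  //
+  //   even = a * (src[2 * j] + src[2 * j + 1]);  // first  output row
+  //   odd  = a * (src[2 * j] - src[2 * j + 1]);  // second output row
+  //   dst[0 * line + j] = (even + add) >> shift;
+  //   dst[1 * line + j] = (odd  + add) >> shift;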
+  const __m256i debias = _mm256_set1_epi32(add);
+  __m256i v_coeff_0 = _mm256_load_si256((__m256i*)coeff);
+  __m256i v_coeff_1 = _mm256_load_si256((__m256i*)(coeff + 16));
+  __m256i* v_dst_ptr = dst;
+
+  const int reduced_line = line - skip_line;
+  // Handle 8 lines at a time (16 samples, 2 samples per line)
+  for (int j = 0; j < reduced_line; j += 8) {
+    // src vector: [00 01 02 03 04 05 06 07|08 09 10 11 12 13 14 15]
+    __m256i v_src = _mm256_load_si256((const __m256i*) src);
+
+    // Multiply with a and add together all adjacent elements
+    // even vector: [a00+a01 a02+a03 a04+a05 a06+a07|a08+a09 a10+a11 a12+a13 a14+a15]
+    __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0);
+    // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15]
+    __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1);
+
+    __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift);
+    __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift);
+
+    v_dst_ptr[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+
+    src += 16;
+    v_dst_ptr++;
+  }
+}
+
+static void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+  const int width = 2;
+  const int height = 8;
+
+  const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+  const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+  const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+  const int32_t shift_2nd = log2_height_minus1 + 7;
+
+  const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
+  const int16_t* ver_coeff = ff_dct2_2x8_coeff_ver;
+  if (ver == DST7) {
+    ver_coeff = ff_dst7_2x8_coeff_ver;
+  }
+  // No DCT8 coeffs for the vertical pass and no DST7/DCT8 coeffs for the
+  // 2-wide horizontal pass, since those transforms are not defined for this block size
+
+  __m256i v_hor_pass_out;
+  fast_forward_tr_2xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0);
+
+  // Vertical pass
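+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: drop the (shift_2nd > 0) check if the shift is always greater than zero
+  // A hedged scalar view of truncate_avx2 below: per 32-bit element it is the
+  // plain rounding right shift
+  //
+  //   out = (in + add) >> shift_2nd;
+  //
+  // and the clamp to int16_t range comes later from _mm256_packs_epi32.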
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)ver_coeff;
+
+  // Data fits into a single vector; reorder it for the vertical pass
+  const __m256i v_src_raw = v_hor_pass_out;
+  __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd[8];
+  for (int i = 0; i < 8; ++i) {
+    v_madd[i] = _mm256_madd_epi16(v_src, v_coeff[i]);
+  }
+  __m256i v_hadd_0[4];
+  for (int i = 0; i < 4; ++i) {
+    const int offset = i * 2;
+    v_hadd_0[i] = _mm256_hadd_epi32(v_madd[offset], v_madd[offset + 1]);
+  }
+
+  __m256i v_trunc[2];
+  for (int i = 0; i < 2; ++i) {
+    const int offset = i * 2;
+    v_trunc[i] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[offset], v_hadd_0[offset + 1]), debias, shift_2nd);
+  }
+
+  __m256i v_result = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+  const __m256i v_res_shfl = _mm256_load_si256((const __m256i*)ff_dct2_2x8_result_shuffle_ver);
+  // Shuffle values to correct order
+  v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi32(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi8(v_result, v_res_shfl);
+  _mm256_store_si256((__m256i*)dst, v_result);
+}
+
+
+static void fast_inverse_tr_2x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_hor);
+
+  const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src);
+
+  __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle);
+  v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]);
+  __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]);
+  __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]);
+  __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]);
+  __m256i v_madd_4 = _mm256_madd_epi16(v_src, v_coeff[4]);
+  __m256i v_madd_5 = _mm256_madd_epi16(v_src, v_coeff[5]);
+  __m256i v_madd_6 = _mm256_madd_epi16(v_src, v_coeff[6]);
+  __m256i v_madd_7 = _mm256_madd_epi16(v_src, v_coeff[7]);
+
+  __m256i v_hadd_00 = _mm256_hadd_epi32(v_madd_0, v_madd_1);
+  __m256i v_hadd_01 = _mm256_hadd_epi32(v_madd_2, v_madd_3);
+  __m256i v_hadd_02 = _mm256_hadd_epi32(v_madd_4, v_madd_5);
+  __m256i v_hadd_03 = _mm256_hadd_epi32(v_madd_6, v_madd_7);
+
+  __m256i v_hadd_10 = _mm256_hadd_epi32(v_hadd_00, v_hadd_01);
+  __m256i v_hadd_11 = _mm256_hadd_epi32(v_hadd_02, v_hadd_03);
+
+  __m256i v_trunc_0 = truncate_avx2(v_hadd_10, debias, shift);
+  __m256i v_trunc_1 = truncate_avx2(v_hadd_11, debias, shift);
+
+  dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+}
+
+static void fast_inverse_tr_2x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]);
+  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]);
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_ver);
+  const 
__m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_res_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_8x2_coeff_hor; + } + const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_2x8_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x8_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_16[0][0]; + } + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_hor_pass_out[2]; + fast_forward_tr_2xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
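+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: drop the (shift_2nd > 0) check if the shift is always greater than zero
+  // Conceptually this pass multiplies each of the two 16-sample columns with
+  // the 16x16 coefficient matrix. A scalar sketch that ignores the vectorized
+  // register layout (hor_out stands for the horizontal-pass result in raster
+  // order; ver_coeff is row-major 16x16):
+  //
+  //   for (int k = 0; k < 16; ++k) {
+  //     for (int x = 0; x < 2; ++x) {
+  //       int32_t acc = 0;
+  //       for (int y = 0; y < 16; ++y) {
+  //         acc += ver_coeff[k * 16 + y] * hor_out[y * 2 + x];
+  //       }
+  //       dst[k * 2 + x] = (int16_t)((acc + add) >> shift_2nd);
+  //     }
+  //   }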
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  // Permute hor pass output to correct order
+  __m256i v_tmp_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i v_tmp_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i v_src_0 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x20);
+  __m256i v_src_1 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x31);
+
+  const __m256i* v_coeff_ptr = (const __m256i*)ver_coeff;
+
+  __m256i v_madd[2][16];
+  for (int i = 0; i < 16; ++i) {
+    v_madd[0][i] = _mm256_madd_epi16(v_src_0, v_coeff_ptr[i]);
+    v_madd[1][i] = _mm256_madd_epi16(v_src_1, v_coeff_ptr[i]);
+  }
+
+  __m256i v_hadd_0[2][8];
+  for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) {
+    v_hadd_0[0][dst] = _mm256_hadd_epi32(v_madd[0][src], v_madd[0][src + 1]);
+    v_hadd_0[1][dst] = _mm256_hadd_epi32(v_madd[1][src], v_madd[1][src + 1]);
+  }
+
+  __m256i v_hadd_1[2][4];
+  for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) {
+    v_hadd_1[0][dst] = _mm256_hadd_epi32(v_hadd_0[0][src], v_hadd_0[0][src + 1]);
+    v_hadd_1[1][dst] = _mm256_hadd_epi32(v_hadd_0[1][src], v_hadd_0[1][src + 1]);
+  }
+
+  __m256i v_tmp_00 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x20);
+  __m256i v_tmp_01 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x31);
+  __m256i v_tmp_02 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x20);
+  __m256i v_tmp_03 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x31);
+
+  __m256i v_tmp_10 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x20);
+  __m256i v_tmp_11 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x31);
+  __m256i v_tmp_12 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x20);
+  __m256i v_tmp_13 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x31);
+
+  __m256i v_trunc_00 = truncate_avx2((_mm256_add_epi32(v_tmp_00, v_tmp_01)), debias, shift_2nd);
+  __m256i v_trunc_01 = truncate_avx2((_mm256_add_epi32(v_tmp_02, v_tmp_03)), debias, shift_2nd);
+
+  __m256i v_trunc_10 = truncate_avx2((_mm256_add_epi32(v_tmp_10, v_tmp_11)), debias, shift_2nd);
+  __m256i v_trunc_11 = truncate_avx2((_mm256_add_epi32(v_tmp_12, v_tmp_13)), debias, shift_2nd);
+
+  __m256i v_result_0 = _mm256_packs_epi32(v_trunc_00, v_trunc_10);
+  __m256i v_result_1 = _mm256_packs_epi32(v_trunc_01, v_trunc_11);
+
+  v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle);
+  v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle);
+
+  _mm256_store_si256((__m256i*)&dst[0], v_result_0);
+  _mm256_store_si256((__m256i*)&dst[16], v_result_1);
+}
+
+
+static void fast_inverse_tr_2x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246);
+
+  __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle);
+  __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle);
+
+  v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0));
+  v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd_0[16];
+  __m256i v_madd_1[16];
+  for (int c = 0; c < 16; ++c) {
+    v_madd_0[c] = _mm256_madd_epi16(v_src_0, 
v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); +} + +static void fast_inverse_tr_2x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_lo = _mm256_unpacklo_epi16(src[0], src[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(src[0], src[1]); + + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_lo_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_lo_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_madd_hi_0, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_madd_hi_1, debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_16x2_coeff_hor; + } + const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_2x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = &uvg_g_dct_32[0][0]; + // For result shuffling, can use existing shuffle vector + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + ALIGNED(32) int16_t v_hor_pass_out[2*32]; + fast_forward_tr_2xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[4]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 2; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ff_dct2_32x32_coeff_ver; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + + __m256i v_src = _mm256_set1_epi32(*temp_source); + temp_source += i & 1 ? 3 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 2); +} + + +static void fast_inverse_tr_2x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 4; ++i) { + v_src[i] = 
_mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + const __m256i v_coeff_0 = _mm256_setr_epi64x(c_ptr[0], c_ptr[1], c_ptr[0], c_ptr[1]); + const __m256i v_coeff_1 = _mm256_setr_epi64x(c_ptr[2], c_ptr[3], c_ptr[2], c_ptr[3]); + const __m256i v_coeff_2 = _mm256_setr_epi64x(c_ptr[4], c_ptr[5], c_ptr[4], c_ptr[5]); + const __m256i v_coeff_3 = _mm256_setr_epi64x(c_ptr[6], c_ptr[7], c_ptr[6], c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[c] = _mm256_add_epi32(v_add_00, v_add_01); + c_ptr += 8; + } + + __m256i v_hadd_0[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_2x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i v_src_lo0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_lo1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + const __m256i v_src_hi0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_hi1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[1]), debias, shift); + __m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[1]), debias, shift); + + __m256i v_result[4]; + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01), v_res_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_10, 
v_trunc_lo_11), v_res_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01), v_res_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11), v_res_shuffle); + + v_result[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; // rename + const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_2x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + +} + + +static void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + const int reduced_line = line - skip_line; + // Handle 4 lines at a time (16 samples, 4 samples per line) + for (int j = 0; j < reduced_line; j += 4) { + // line 0 line 1 line 2 line 3 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*) src); + + // Arrange data for column-wise calculation. Data and coeffs are ordered so no further shuffling + // or permutes are needed. 
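+    // After the two permutes below, each 128-bit lane holds the same reordered samples: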
+ // vec 1 : [s00 s01 s04 s05 s08 s09 s12 s13 | s00 s01 s04 s05 s08 s09 s12 s13] + // vec 2 : [s02 s03 s06 s07 s10 s11 s14 s15 | s02 s03 s06 s07 s10 s11 s14 s15] + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +static void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + // TODO: coeffs for DST7 and DCT8 + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = fast_forward_dct2_b4_coeff; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } + else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = fast_forward_dst7_b4_coeff; + } + else if (ver == DCT8) { + ver_coeff = fast_forward_dct8_b4_coeff; + } + + __m256i v_hor_pass_out; + fast_forward_tr_4xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & ver_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & ver_coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & ver_coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & ver_coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_4x4_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_shuffle_hor); + + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_4x4_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_result_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_4xN_coeff_hor; + const int16_t* ver_coeff = fi_dct2_4xN_coeff_hor; // Can use same table for both passes + if (hor == DST7) { + hor_coeff = fi_dst7_4xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4xN_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x8_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + __m256i v_madd[2][8]; + for (int i = 0; i < 8; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff[0]); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + } + + __m256i v_trunc[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + // Order results + v_result[0] = _mm256_permute4x64_epi64(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + v_result[0] = _mm256_shuffle_epi32(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_shuffle_epi32(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)&dst[0], v_result[0]); + _mm256_store_si256((__m256i*)&dst[16], v_result[1]); +} + + +static void fast_inverse_tr_4x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + const __m256i v_permute = _mm256_load_si256((const __m256i*)permute_32b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + v_src_0 = _mm256_permutevar8x32_epi32(v_src_0, v_permute); + v_src_1 = _mm256_permutevar8x32_epi32(v_src_1, v_permute); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_madd_04 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_14 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + + __m256i v_madd_05 = _mm256_madd_epi16(v_src_0, v_coeff[10]); + __m256i v_madd_15 = _mm256_madd_epi16(v_src_1, v_coeff[11]); + + __m256i v_madd_06 = _mm256_madd_epi16(v_src_0, v_coeff[12]); + __m256i v_madd_16 = _mm256_madd_epi16(v_src_1, v_coeff[13]); + + __m256i v_madd_07 = _mm256_madd_epi16(v_src_0, v_coeff[14]); + __m256i v_madd_17 = _mm256_madd_epi16(v_src_1, v_coeff[15]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + __m256i v_add_4 = _mm256_add_epi32(v_madd_04, v_madd_14); + __m256i v_add_5 = _mm256_add_epi32(v_madd_05, v_madd_15); + __m256i v_add_6 = _mm256_add_epi32(v_madd_06, v_madd_16); + __m256i v_add_7 = _mm256_add_epi32(v_madd_07, v_madd_17); + + __m256i v_hadd_0 = 
_mm256_hadd_epi32(v_add_0, v_add_1); + __m256i v_hadd_1 = _mm256_hadd_epi32(v_add_2, v_add_3); + __m256i v_hadd_2 = _mm256_hadd_epi32(v_add_4, v_add_5); + __m256i v_hadd_3 = _mm256_hadd_epi32(v_add_6, v_add_7); + + __m256i v_trunc_0 = truncate_avx2(v_hadd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_hadd_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_hadd_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_hadd_3, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_4x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + v_result_0 = _mm256_shuffle_epi32(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_shuffle_epi32(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x4_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_4x8_avx2_ver(src, 
v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_16[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_16[0][0]; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const int64_t* coeff_ptr = (const int64_t*)ver_coeff; // Read four coeffs at once by casting into 64 bit integer + + __m256i v_madd[4][16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff_0 = _mm256_set1_epi64x(coeff_ptr[0]); + const __m256i v_coeff_1 = _mm256_set1_epi64x(coeff_ptr[1]); + const __m256i v_coeff_2 = _mm256_set1_epi64x(coeff_ptr[2]); + const __m256i v_coeff_3 = _mm256_set1_epi64x(coeff_ptr[3]); + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff_0); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff_1); + v_madd[2][i] = _mm256_madd_epi16(v_hor_pass_out[2], v_coeff_2); + v_madd[3][i] = _mm256_madd_epi16(v_hor_pass_out[3], v_coeff_3); + coeff_ptr += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_tmp0 = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + __m256i v_tmp1 = _mm256_add_epi32(v_madd[2][i], v_madd[3][i]); + + v_add[i] = _mm256_add_epi32(v_tmp0, v_tmp1); + } + + __m256i v_trunc[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_shuffle_epi32(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_4x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + __m256i 
v_src_2 = _mm256_shuffle_epi8(v_src_raw[2], v_shuffle); + __m256i v_src_3 = _mm256_shuffle_epi8(v_src_raw[3], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_permute4x64_epi64(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_permute4x64_epi64(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + v_src_0 = _mm256_shuffle_epi32(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_shuffle_epi32(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_shuffle_epi32(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_shuffle_epi32(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int c = 0; c < 16; c++) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[3]); + v_coeff += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_hadd[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_4x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[2], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[2], src[3], 0x31); + + __m256i v_madd_0[4]; + __m256i v_madd_1[4]; + __m256i v_madd_2[4]; + __m256i v_madd_3[4]; + for (int c = 0; c < 4; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_trunc_0[4]; + __m256i v_trunc_1[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_0[i] = truncate_avx2(_mm256_add_epi32(v_madd_0[i], v_madd_1[i]), debias, shift); + v_trunc_1[i] = truncate_avx2(_mm256_add_epi32(v_madd_2[i], v_madd_3[i]), debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = 
_mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i v_tmp32_0 = _mm256_unpacklo_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_1 = _mm256_unpackhi_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_2 = _mm256_unpacklo_epi32(v_tmp2, v_tmp3); + __m256i v_tmp32_3 = _mm256_unpackhi_epi32(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x4_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_4x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x32_coeff_ver; + } + + // Must be 32-byte aligned: the horizontal pass stores through an __m256i pointer (matches the 8x32 variant) + ALIGNED(32) int16_t v_hor_pass_out[4 * 32]; + fast_forward_tr_4xN_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[8]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 4; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + __m256i v_src = _mm256_set1_epi32(*temp_source); + // After even iterations step to the adjacent 32-bit sample pair in the same row, after odd ones jump to the next 16-sample row + temp_source += i & 1 ? 7 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 4); +} + + +static void fast_inverse_tr_4x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[i] = _mm256_shuffle_epi32(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + const __m256i v_coeff = _mm256_set1_epi64x(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_add[c] = _mm256_add_epi32(v_add_10, v_add_11); + } + + __m256i v_hadd[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_4x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + +
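// Regroup the 128-bit lanes of each source vector pair: permute2x128 control 0x20 concatenates the low lanes of its two inputs, 0x31 the high lanes, before the madd stage below. +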
__m256i v_src[8]; + __m256i v_tmp[8]; + v_src[0] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x31); + v_src[2] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x31); + v_src[4] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x20); + v_src[7] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x31); + + for (int d = 0, c = 0; c < 4; ++c, d += 2) { + __m256i v_madd_00 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src[2], v_coeff[0]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src[3], v_coeff[1]); + __m256i v_madd_20 = _mm256_madd_epi16(v_src[4], v_coeff[0]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src[5], v_coeff[1]); + __m256i v_madd_30 = _mm256_madd_epi16(v_src[6], v_coeff[0]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src[7], v_coeff[1]); + v_coeff += 2; + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_01), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_10, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_20, v_madd_21), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_30, v_madd_31), debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp[d + 0] = _mm256_permute4x64_epi64(v_tmp[d + 0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[d + 1] = _mm256_permute4x64_epi64(v_tmp[d + 1], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[8]; + transpose_avx2(v_tmp, v_result, 32, 4); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_4x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int j = 0; j < reduced_line; j += 2) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange source in a way samples can be added together column-wise using add + // after first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...] + + __m256i v_src_0 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +static void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x2_coeff_ver; + // Only DCT2 is defined for 8x2 block + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_forward_tr_8xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // TODO: coeffs for DST7 and DCT8 transforms + const __m256i* v_coeff = (const __m256i*)ver_coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x2_ver_pass_shuffle); + + // 8x2, only 16 samples, handle all at once + __m256i v_src_per = _mm256_permute4x64_epi64(v_hor_pass_out, _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + // v_src = _mm256_unpackhi_epi16(v_src_raw, v_src_swp); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); // TODO: this permute can probably be optimized away + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_2x8_shuffle_hor); + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + // Got data for one vector + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0); + // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15] + __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + // Duplicate sources to enable vertical addition + __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01); + __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21); + __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x8_coeff_ver; + } + // Vertical pass is always DCT2: DST7 and DCT8 are not defined for size 2 + + __m256i v_ver_pass_out; + fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + +static void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x4_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // 32 samples, process in two steps + __m256i v_src_per_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_src_per_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_per_0, v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_per_1, v_shuffle); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(v_add_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(v_add_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + // Swap each middle 16 bit value in each 64 bit chunk + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_8x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i v_src_raw_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_raw_1 = _mm256_load_si256((const __m256i*) & src[16]); + + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw_0, v_src_raw_1); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw_0, v_src_raw_1); + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[1]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[3]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[4]); + __m256i v_madd_11 = 
_mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_8x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x31); + + v_result[0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result[0]); + _mm256_store_si256((__m256i*) & dst[16], v_result[1]); +} + +static void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x8_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_8x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int 
height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x8_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + const int32_t* coeff_ptr = (const int32_t*)ver_coeff; // Cast into 32 bit integer to read two coeffs at a time + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_trunc[8]; + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x31); + + for (int i = 0; i < 8; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi32(coeff_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi32(coeff_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi32(coeff_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi32(coeff_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_trunc[i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift_2nd); + coeff_ptr += 4; + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_8x8_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[4]; + v_src[0] = _mm256_permute4x64_epi64(v_src_raw[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = 
_mm256_permute4x64_epi64(v_src_raw[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_permute4x64_epi64(v_src_raw[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_permute4x64_epi64(v_src_raw[3], _MM_SHUFFLE(3, 1, 2, 0)); + + v_src[0] = _mm256_shuffle_epi8(v_src[0], v_shuffle); + v_src[1] = _mm256_shuffle_epi8(v_src[1], v_shuffle); + v_src[2] = _mm256_shuffle_epi8(v_src[2], v_shuffle); + v_src[3] = _mm256_shuffle_epi8(v_src[3], v_shuffle); + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_8x8_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[4]; + v_src[0] = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_shuffle_epi32(src[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_shuffle_epi32(src[3], _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + __m256i v_tmp2 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + __m256i v_tmp3 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + + v_src[0] = _mm256_unpacklo_epi64(v_tmp0, v_tmp2); + v_src[1] = _mm256_unpackhi_epi64(v_tmp0, v_tmp2); + v_src[2] = _mm256_unpacklo_epi64(v_tmp1, v_tmp3); + v_src[3] = _mm256_unpackhi_epi64(v_tmp1, v_tmp3); + + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = 
_mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + v_result[2] = _mm256_shuffle_epi8(v_result[2], v_res_shuffle); + v_result[3] = _mm256_shuffle_epi8(v_result[3], v_res_shuffle); + + __m256i v_rtmp0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp1 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp2 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp3 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + + __m256i v_tmp20 = _mm256_unpacklo_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp21 = _mm256_unpackhi_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp22 = _mm256_unpacklo_epi64(v_rtmp1, v_rtmp3); + __m256i v_tmp23 = _mm256_unpackhi_epi64(v_rtmp1, v_rtmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_8x8_coeff_hor; + const int16_t* ver_coeff = fi_dct2_8x8_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_8x8_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x8_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x8_coeff_hor; + } + + __m256i v_hor_pass_out[4]; + fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x16_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Can use same shuffles as 8x4 + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + //const __m256i* v_coeff = (const __m256i*)ver_coeff; + const int32_t *line_coeff = (const int32_t*)ver_coeff; + + // Multiply+add all source vectors with coeff vectors + __m256i v_madd[8][16]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 8; ++i) { + __m256i v_src_per = _mm256_permute4x64_epi64(v_src_ptr[0], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + + for (int ii = 0; ii < 16; ++ii) { + //int coeff_row = ii * 8 + i; + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd[i][ii] = _mm256_madd_epi16(v_src, v_coeff); + } + line_coeff += 16; + v_src_ptr += 1; + } + + // Add vectors + __m256i v_add_0[4][16]; + for (int i = 0; i < 4; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_0[i][ii] = _mm256_add_epi32(v_madd[offset][ii], v_madd[offset + 1][ii]); + } + } + // Second round of additions + __m256i v_add_1[2][16]; + for (int i = 0; i < 2; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_1[i][ii] = _mm256_add_epi32(v_add_0[offset][ii], v_add_0[offset + 1][ii]); + } + } + // Third round of additions + __m256i v_trunc[16]; + for (int ii = 0; ii < 16; ++ii) { + v_trunc[ii] = _mm256_add_epi32(v_add_1[0][ii], v_add_1[1][ii]); + v_trunc[ii] = truncate_avx2(v_trunc[ii], debias, shift_2nd); + } + + + for (int i = 0; i < 16; i += 2) { + __m256i v_result = _mm256_packs_epi32(v_trunc[i], v_trunc[i + 1]); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + // Swap each middle 16 bit value in each 64 bit chunk + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); + dst += 16; + } +} + + +static void fast_inverse_tr_8x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_src_raw[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_tmp[i], v_shuffle); + } + + __m256i v_trunc[16]; + for (int c = 0; c < 16; c++) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff[7]); + + v_coeff += 8; + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = 
_mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } +} + +static void fast_inverse_tr_8x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi32(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp[8]; + v_tmp[0] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + v_tmp[2] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x20); + v_tmp[4] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + v_tmp[5] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + v_tmp[6] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x31); + v_tmp[7] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x31); + + v_src[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_src[1] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_src[2] = _mm256_unpacklo_epi32(v_tmp[4], v_tmp[5]); + v_src[3] = _mm256_unpackhi_epi32(v_tmp[4], v_tmp[5]); + v_src[4] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_src[5] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + v_src[6] = _mm256_unpacklo_epi32(v_tmp[6], v_tmp[7]); + v_src[7] = _mm256_unpackhi_epi32(v_tmp[6], v_tmp[7]); + + __m256i v_trunc[2][8]; + for (int d = 0, s = 0; d < 2; ++d, s += 4) { + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int c = 0; c < 8; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src[s + 0], v_c_ptr[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src[s + 1], v_c_ptr[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src[s + 2], v_c_ptr[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src[s + 3], v_c_ptr[3]); + v_c_ptr += 4; + } + + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_trunc[d][i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift); + } + } + + __m256i v_rtmp[8]; + v_rtmp[0] = _mm256_packs_epi32(v_trunc[0][0], v_trunc[0][1]); + v_rtmp[1] = _mm256_packs_epi32(v_trunc[0][2], v_trunc[0][3]); + v_rtmp[2] = _mm256_packs_epi32(v_trunc[0][4], v_trunc[0][5]); + v_rtmp[3] = _mm256_packs_epi32(v_trunc[0][6], v_trunc[0][7]); + v_rtmp[4] = _mm256_packs_epi32(v_trunc[1][0], v_trunc[1][1]); + v_rtmp[5] = _mm256_packs_epi32(v_trunc[1][2], v_trunc[1][3]); + v_rtmp[6] = _mm256_packs_epi32(v_trunc[1][4], v_trunc[1][5]); + v_rtmp[7] = _mm256_packs_epi32(v_trunc[1][6], v_trunc[1][7]); + + for (int i = 0; i < 8; ++i) { + v_rtmp[i] = _mm256_shuffle_epi8(v_rtmp[i], v_res_shuffle); + } + + __m256i v_tmp32_lo0 = _mm256_unpacklo_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_lo1 = _mm256_unpacklo_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_lo2 = 
_mm256_unpacklo_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_lo3 = _mm256_unpacklo_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp32_hi0 = _mm256_unpackhi_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_hi1 = _mm256_unpackhi_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_hi2 = _mm256_unpackhi_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_hi3 = _mm256_unpackhi_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp64_lo0 = _mm256_unpacklo_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_lo1 = _mm256_unpacklo_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_lo2 = _mm256_unpacklo_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_lo3 = _mm256_unpacklo_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_tmp64_hi0 = _mm256_unpackhi_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_hi1 = _mm256_unpackhi_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_hi2 = _mm256_unpackhi_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_hi3 = _mm256_unpackhi_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_result[8]; + v_result[0] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x31); + v_result[4] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x20); + v_result[7] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x8_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_8x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[8 * 32]; + fast_forward_tr_8xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[16]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 8; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + // Pair one sample from each of the two lines held in row i; memcpy makes the 16-to-32 bit type pun safe + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 16]; + source[1] = v_hor_pass_out[j + i * 16 + 8]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 8); +} + + +static void fast_inverse_tr_8x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src;
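+ + // The three loops below regroup the source rows: permute2x128 (control 0x20 = low 128-bit lanes of both inputs, 0x31 = high lanes) recombines lane halves, and unpacklo/hi_epi16 interleaves the 16-bit samples of adjacent rows, so each madd can accumulate two rows per broadcast coefficient pair.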
+
+  __m256i v_tmp[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_tmp[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_tmp[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp16_lo[8];
+  __m256i v_tmp16_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_src[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+    v_src[d + 0] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x20);
+    v_src[d + 1] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x31);
+  }
+
+  __m256i v_trunc[32];
+
+  for (int row = 0; row < 32; ++row) {
+    __m256i v_res = _mm256_setzero_si256();
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_coeff = _mm256_set1_epi32(*c_ptr);
+      __m256i v_madd = _mm256_madd_epi16(v_src[i], v_coeff);
+      v_res = _mm256_add_epi32(v_res, v_madd);
+      c_ptr++;
+    }
+
+    v_trunc[row] = truncate_avx2(v_res, debias, shift);
+  }
+
+  for (int d = 0, s = 0; d < 16; ++d, s += 2) {
+    dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]);
+  }
+}
+
+static void fast_inverse_tr_8x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff; // Coefficients are loaded as full 256-bit vectors here, no 32-bit pairing needed
+  const __m256i* v_src_raw = src;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246);
+
+  __m256i v_src[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_src[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_src[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp[16];
+  for (int s = 0; s < 16; s += 2) {
+    __m256i v_add[8];
+    for (int d = 0, c = 0; d < 8; ++d, c += 2) {
+      __m256i v_madd_0 = _mm256_madd_epi16(v_src[s + 0], v_coeff[c + 0]);
+      __m256i v_madd_1 = _mm256_madd_epi16(v_src[s + 1], v_coeff[c + 1]);
+
+      v_add[d] = _mm256_add_epi32(v_madd_0, v_madd_1);
+    }
+
+    __m256i v_hadd[4];
+    v_hadd[0] = _mm256_hadd_epi32(v_add[0], v_add[1]);
+    v_hadd[1] = _mm256_hadd_epi32(v_add[2], v_add[3]);
+    v_hadd[2] = _mm256_hadd_epi32(v_add[4], v_add[5]);
+    v_hadd[3] = _mm256_hadd_epi32(v_add[6], v_add[7]);
+
+    __m256i v_trunc[4];
+    v_trunc[0] = truncate_avx2(v_hadd[0], debias, shift);
+    v_trunc[1] = truncate_avx2(v_hadd[1], debias, shift);
+    v_trunc[2] = truncate_avx2(v_hadd[2], debias, shift);
+    v_trunc[3] = truncate_avx2(v_hadd[3], debias, shift);
+
+    v_tmp[s + 0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+    v_tmp[s + 1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]);
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle);
+  }
+
+  __m256i v_tmp64_lo[8];
+  __m256i v_tmp64_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp64_lo[d] = _mm256_unpacklo_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp64_hi[d] = _mm256_unpackhi_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_result[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+    v_result[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x20);
+    v_result[d + 1] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x31);
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    _mm256_store_si256((__m256i*)dst, v_result[i]);
+    dst += 16;
+  }
+
+  // 
TODO: mts cutoff +} + +static void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table + if (hor == DST7) { + hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[16]; + fast_inverse_tr_8x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // ISP_TODO: might be faster to load these from arrays + const __m256i v_permute_0 = _mm256_set1_epi32(0); + const __m256i v_permute_1 = _mm256_set1_epi32(1); + const __m256i v_permute_2 = _mm256_set1_epi32(2); + const __m256i v_permute_3 = _mm256_set1_epi32(3); + const __m256i v_permute_4 = _mm256_set1_epi32(4); + const __m256i v_permute_5 = _mm256_set1_epi32(5); + const __m256i v_permute_6 = _mm256_set1_epi32(6); + const __m256i v_permute_7 = _mm256_set1_epi32(7); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 1 line at a time, 16 samples per line + for (int j = 0; j < reduced_line; ++j) { + // line 1 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + // Arrange data so calculations can be done column-wise (to avoid using hadds). + // Need 8 source vectors. First will be filled with s00 and s01 pairs. 
Second with s02 and s03 pairs and so on + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + __m256i v_src_2 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_2); + __m256i v_src_3 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_3); + __m256i v_src_4 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_4); + __m256i v_src_5 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_5); + __m256i v_src_6 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_6); + __m256i v_src_7 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_7); + + __m256i v_madd_0_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_0_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_0_02 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_0_03 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + __m256i v_madd_0_04 = _mm256_madd_epi16(v_src_4, v_coeff[4]); + __m256i v_madd_0_05 = _mm256_madd_epi16(v_src_5, v_coeff[5]); + __m256i v_madd_0_06 = _mm256_madd_epi16(v_src_6, v_coeff[6]); + __m256i v_madd_0_07 = _mm256_madd_epi16(v_src_7, v_coeff[7]); + + __m256i v_madd_0_08 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_0_09 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + __m256i v_madd_0_10 = _mm256_madd_epi16(v_src_2, v_coeff[10]); + __m256i v_madd_0_11 = _mm256_madd_epi16(v_src_3, v_coeff[11]); + __m256i v_madd_0_12 = _mm256_madd_epi16(v_src_4, v_coeff[12]); + __m256i v_madd_0_13 = _mm256_madd_epi16(v_src_5, v_coeff[13]); + __m256i v_madd_0_14 = _mm256_madd_epi16(v_src_6, v_coeff[14]); + __m256i v_madd_0_15 = _mm256_madd_epi16(v_src_7, v_coeff[15]); + + __m256i v_madd_1_0 = _mm256_add_epi32(v_madd_0_00, v_madd_0_01); + __m256i v_madd_1_1 = _mm256_add_epi32(v_madd_0_02, v_madd_0_03); + __m256i v_madd_1_2 = _mm256_add_epi32(v_madd_0_04, v_madd_0_05); + __m256i v_madd_1_3 = _mm256_add_epi32(v_madd_0_06, v_madd_0_07); + __m256i v_madd_1_4 = _mm256_add_epi32(v_madd_0_08, v_madd_0_09); + __m256i v_madd_1_5 = _mm256_add_epi32(v_madd_0_10, v_madd_0_11); + __m256i v_madd_1_6 = _mm256_add_epi32(v_madd_0_12, v_madd_0_13); + __m256i v_madd_1_7 = _mm256_add_epi32(v_madd_0_14, v_madd_0_15); + + __m256i v_madd_2_0 = _mm256_add_epi32(v_madd_1_0, v_madd_1_1); + __m256i v_madd_2_1 = _mm256_add_epi32(v_madd_1_2, v_madd_1_3); + __m256i v_madd_2_2 = _mm256_add_epi32(v_madd_1_4, v_madd_1_5); + __m256i v_madd_2_3 = _mm256_add_epi32(v_madd_1_6, v_madd_1_7); + + __m256i v_madd_3_0 = _mm256_add_epi32(v_madd_2_0, v_madd_2_1); + __m256i v_madd_3_1 = _mm256_add_epi32(v_madd_2_2, v_madd_2_3); + + __m256i v_trunc_0 = truncate_avx2(v_madd_3_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_3_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + dst[0] = v_result; + + src += 16; + dst++; + } +} + +static void fast_forward_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x2_coeff_ver; + if (hor == DST7) { + hor_coeff = 
ff_dst7_16xN_coeff_hor; + } + + __m256i v_hor_pass_out[2]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 2 source vectors + // Unpack -> samples to be added are adjacent + __m256i v_src_hi = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_trunc_hi_0 = truncate_avx2(v_madd_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_madd_hi_1, debias, shift_2nd); + __m256i v_trunc_lo_0 = truncate_avx2(v_madd_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_madd_lo_1, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_16x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i v_src_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_1 = _mm256_load_si256((const __m256i*) & src[16]); + + const __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_0, v_src_1); + const __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + __m256i v_trunc_0 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_0), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_1), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_0), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_1), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_16x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_madd_e[16]; + __m256i v_madd_o[16]; + for (int i = 0, c = 0; i < 16; ++i, c += 2) { + v_madd_e[i] = _mm256_madd_epi16(src[0], v_coeff[c + 0]); + v_madd_o[i] = _mm256_madd_epi16(src[1], v_coeff[c + 1]); + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_e[i], v_madd_o[i]); + } + + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_permute4x64_epi64(v_add[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_hadd_0[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int src = 0, dst = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]), debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x16_coeff_ver; + } + // DST7 and DCT8 are not defined for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_16x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x4_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 vectors + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[2]); + __m256i v_madd_hi_02 = _mm256_madd_epi16(v_src_hi_0, v_coeff[4]); + __m256i v_madd_hi_03 = _mm256_madd_epi16(v_src_hi_0, v_coeff[6]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[3]); + __m256i v_madd_hi_12 = _mm256_madd_epi16(v_src_hi_1, v_coeff[5]); + __m256i v_madd_hi_13 = _mm256_madd_epi16(v_src_hi_1, v_coeff[7]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[2]); + __m256i v_madd_lo_02 = _mm256_madd_epi16(v_src_lo_0, v_coeff[4]); + __m256i v_madd_lo_03 = _mm256_madd_epi16(v_src_lo_0, v_coeff[6]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[3]); + __m256i v_madd_lo_12 = _mm256_madd_epi16(v_src_lo_1, v_coeff[5]); + __m256i v_madd_lo_13 = _mm256_madd_epi16(v_src_lo_1, v_coeff[7]); + + __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi_00, v_madd_hi_10); + __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi_01, v_madd_hi_11); + __m256i v_add_hi_2 = _mm256_add_epi32(v_madd_hi_02, v_madd_hi_12); + __m256i v_add_hi_3 = _mm256_add_epi32(v_madd_hi_03, v_madd_hi_13); + + __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo_00, v_madd_lo_10); + __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo_01, v_madd_lo_11); + __m256i v_add_lo_2 = _mm256_add_epi32(v_madd_lo_02, v_madd_lo_12); + __m256i v_add_lo_3 = _mm256_add_epi32(v_madd_lo_03, v_madd_lo_13); + + __m256i v_trunc_hi_0 = truncate_avx2(v_add_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_add_hi_1, debias, shift_2nd); + __m256i v_trunc_hi_2 = truncate_avx2(v_add_hi_2, debias, shift_2nd); + __m256i v_trunc_hi_3 = truncate_avx2(v_add_hi_3, debias, shift_2nd); + + __m256i v_trunc_lo_0 = truncate_avx2(v_add_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_add_lo_1, debias, shift_2nd); + __m256i v_trunc_lo_2 = truncate_avx2(v_add_lo_2, debias, shift_2nd); + __m256i v_trunc_lo_3 = truncate_avx2(v_add_lo_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_2, v_trunc_hi_2); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_3, v_trunc_hi_3); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_16x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + 
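+  // A note on rounding (a sketch, assuming truncate_avx2 behaves as its uses
+  // elsewhere in this file suggest): with add = 1 << (shift - 1) folded into
+  // debias, each 32-bit lane becomes (x + add) >> shift, i.e. a round-half-up
+  // arithmetic shift rather than plain truncation.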
const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[2], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[2], v_src_raw[3]); + + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int i = 0; i < 4; i++) { + v_madd_lo_0[i] = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + v_madd_lo_1[i] = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + v_madd_hi_0[i] = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + v_madd_hi_1[i] = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[i], v_madd_lo_1[i]), debias, shift); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[i], v_madd_hi_1[i]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_hi[0]); + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_hi[1]); + dst[2] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_hi[2]); + dst[3] = _mm256_packs_epi32(v_trunc_lo[3], v_trunc_hi[3]); +} + +static void fast_inverse_tr_16x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[2], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[2], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[1], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[1], src[3], 0x31); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int i = 0; i < 16; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[i] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add_0[16]; + __m256i v_add_1[16]; + for (int i = 0; i < 16; ++i) { + v_add_0[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + v_add_1[i] = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + } + + __m256i v_hadd_0[16]; + for (int i = 0; i < 16; ++i) { + v_hadd_0[i] = _mm256_hadd_epi32(v_add_0[i], v_add_1[i]); + } + + __m256i v_hadd_1[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i 
v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x16_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x16_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_16x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x8_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* line_coeff = (const int32_t*)ver_coeff; + + // Got 8 lines of samples. 
Handle two lines at a time (because of unpack)
+  __m256i v_madd_hi[4][8];
+  __m256i v_madd_lo[4][8];
+  __m256i* v_src_ptr = v_hor_pass_out;
+  for (int i = 0; i < 4; ++i) {
+    __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]);
+    __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]);
+
+    // Apply coefficients
+    for (int ii = 0; ii < 8; ++ii) {
+      const int32_t coeff = line_coeff[ii];
+      const __m256i v_coeff = _mm256_set1_epi32(coeff);
+      v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff);
+      v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff);
+    }
+
+    line_coeff += 8;
+    v_src_ptr += 2;
+  }
+
+  // First round of additions
+  __m256i v_add_hi[2][8];
+  __m256i v_add_lo[2][8];
+  for (int i = 0; i < 2; ++i) {
+    for (int ii = 0; ii < 8; ++ii) {
+      const int offset = i * 2;
+      v_add_hi[i][ii] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]);
+      v_add_lo[i][ii] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]);
+    }
+  }
+
+  // Final round of additions, truncation and store
+  for (int ii = 0; ii < 8; ++ii) {
+    __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi[0][ii], v_add_hi[1][ii]), debias, shift_2nd);
+    __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo[0][ii], v_add_lo[1][ii]), debias, shift_2nd);
+    __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi);
+
+    _mm256_store_si256((__m256i*)dst, v_result);
+    dst += 16;
+  }
+}
+
+
+static void fast_inverse_tr_16x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_src_lo[4];
+  __m256i v_src_hi[4];
+  for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) {
+    v_src_lo[dst] = _mm256_unpacklo_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+    v_src_hi[dst] = _mm256_unpackhi_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+  }
+
+  __m256i v_trunc_lo[8];
+  __m256i v_trunc_hi[8];
+
+  for (int c = 0; c < 8; c++) {
+    __m256i v_madd_lo[4];
+    __m256i v_madd_hi[4];
+    for (int i = 0; i < 4; ++i) {
+      v_madd_lo[i] = _mm256_madd_epi16(v_src_lo[i], v_coeff[i]);
+      v_madd_hi[i] = _mm256_madd_epi16(v_src_hi[i], v_coeff[i]);
+    }
+    v_coeff += 4;
+
+    __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo[0], v_madd_lo[1]);
+    __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo[2], v_madd_lo[3]);
+
+    __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi[0], v_madd_hi[1]);
+    __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi[2], v_madd_hi[3]);
+
+    v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_0, v_add_lo_1), debias, shift);
+    v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_0, v_add_hi_1), debias, shift);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]);
+  }
+}
+
+static void fast_inverse_tr_16x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415);
+
+  __m256i v_tmp32_lo_0 = _mm256_unpacklo_epi32(src[0], src[1]);
+  __m256i v_tmp32_lo_1 = _mm256_unpacklo_epi32(src[2], src[3]);
+  __m256i v_tmp32_lo_2 = _mm256_unpacklo_epi32(src[4], src[5]);
+  __m256i v_tmp32_lo_3 = 
_mm256_unpacklo_epi32(src[6], src[7]); + + __m256i v_tmp32_hi_0 = _mm256_unpackhi_epi32(src[0], src[1]); + __m256i v_tmp32_hi_1 = _mm256_unpackhi_epi32(src[2], src[3]); + __m256i v_tmp32_hi_2 = _mm256_unpackhi_epi32(src[4], src[5]); + __m256i v_tmp32_hi_3 = _mm256_unpackhi_epi32(src[6], src[7]); + + __m256i v_tmp64_lo_0 = _mm256_unpacklo_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_lo_1 = _mm256_unpacklo_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_lo_2 = _mm256_unpacklo_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_lo_3 = _mm256_unpacklo_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_tmp64_hi_0 = _mm256_unpackhi_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_hi_1 = _mm256_unpackhi_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_hi_2 = _mm256_unpackhi_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_hi_3 = _mm256_unpackhi_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x20); + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x31); + + + __m256i v_trunc[16]; + for (int c = 0; c < 16; ++c) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff[i]); + } + v_coeff += 8; + + __m256i v_add_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_0[dst] = _mm256_add_epi32(v_madd[src + 0], v_madd[src + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + __m256i v_result[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_result[dst] = _mm256_packs_epi32(v_trunc[src + 0], v_trunc[src + 1]); + } + + for (int i = 0; i < 8; ++i) { + v_result[i] = _mm256_shuffle_epi8(v_result[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo_0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_lo_1 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_lo_2 = _mm256_unpacklo_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_lo_3 = _mm256_unpacklo_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp32_hi_0 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_hi_1 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_hi_2 = _mm256_unpackhi_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_hi_3 = _mm256_unpackhi_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp64_lo_0 = _mm256_unpacklo_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_lo_1 = _mm256_unpacklo_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_lo_2 = _mm256_unpacklo_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_lo_3 = _mm256_unpacklo_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + __m256i v_rtmp64_hi_0 = _mm256_unpackhi_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_hi_1 = _mm256_unpackhi_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_hi_2 = _mm256_unpackhi_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_hi_3 = 
_mm256_unpackhi_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x16_coeff_hor; + const int16_t* hor_coeff = fi_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x16_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_16x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast into 32 bit integer to read two coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo[8][PART_DIMENSION]; + __m256i v_madd_hi[8][PART_DIMENSION]; + for (int i = 0; i < 8; ++i) { + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo[i][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + v_madd_hi[i][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + } + v_src_ptr += 2; + coeff_ptr += 16; + } + + __m256i v_trunc_lo[PART_DIMENSION]; + __m256i v_trunc_hi[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_add_lo_0[4]; + __m256i v_add_hi_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_0[dst] = _mm256_add_epi32(v_madd_lo[src + 0][i], v_madd_lo[src + 1][i]); + v_add_hi_0[dst] = _mm256_add_epi32(v_madd_hi[src + 0][i], v_madd_hi[src + 1][i]); + } + + __m256i v_add_lo_1[2]; + __m256i v_add_hi_1[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_1[dst] = _mm256_add_epi32(v_add_lo_0[src + 0], v_add_lo_0[src + 1]); + v_add_hi_1[dst] = _mm256_add_epi32(v_add_hi_0[src + 0], v_add_hi_0[src + 1]); + } + + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0], v_add_lo_1[1]), debias, shift_2nd); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0], v_add_hi_1[1]), debias, shift_2nd); + } + __m256i v_result[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + v_result[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + } + + for (int i = 0; i < PART_DIMENSION; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + } + +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = (const __m256i*)src; + + //__m256i v_madd_lo[8][16]; + //__m256i v_madd_hi[8][16]; + //for (int s = 0; s < 8; ++s) { + // __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + // __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + // v_src_raw += 2; + + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_lo[s][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + // v_madd_hi[s][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_lo_0[4][16]; + //__m256i v_add_hi_0[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_lo_0[d][c] = _mm256_add_epi32(v_madd_lo[s + 0][c], v_madd_lo[s + 1][c]); + // v_add_hi_0[d][c] = _mm256_add_epi32(v_madd_hi[s + 0][c], v_madd_hi[s + 1][c]); + // } + //} + + //__m256i v_add_lo_1[2][16]; + //__m256i v_add_hi_1[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // 
for (int c = 0; c < 16; ++c) { + // v_add_lo_1[d][c] = _mm256_add_epi32(v_add_lo_0[s + 0][c], v_add_lo_0[s + 1][c]); + // v_add_hi_1[d][c] = _mm256_add_epi32(v_add_hi_0[s + 0][c], v_add_hi_0[s + 1][c]); + // } + //} + + //__m256i v_trunc_lo[16]; + //__m256i v_trunc_hi[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0][c], v_add_lo_1[1][c]), debias, shift); + // v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0][c], v_add_hi_1[1][c]), debias, shift); + //} + + //for (int i = 0; i < 16; ++i) { + // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + //} + + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 32]; + source[1] = src[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + dst[j] = packed; + } +} + +static void fast_inverse_tr_16x16_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i v_result[16]; + int16_t *src_p = (int16_t*)src; + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src_p[j + i * 32]; + source[1] = src_p[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i coeff_0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i coeff_1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, coeff_0); + __m256i madd1 = _mm256_madd_epi16(v_src, coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i *)dst, packed); + dst += 16; + } + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = src; + + //// Do a 32-bit transpose to arrange result from previous pass + //__m256i v_tmp32_lo[8]; + //__m256i v_tmp32_hi[8]; + //for (int d = 0, s = 0; d < 8; ++d, s += 2) { + // v_tmp32_lo[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + // v_tmp32_hi[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + //} + + 
//__m256i v_tmp64_lo[8]; + //__m256i v_tmp64_hi[8]; + //for (int d = 0, s = 0; d < 4; ++d, s += 2) { + // v_tmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_lo[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + + // v_tmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_hi[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + //} + // + //__m256i v_src[16]; + //v_src[ 0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + //v_src[ 1] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + //v_src[ 2] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x20); + //v_src[ 3] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x20); + //v_src[ 4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + //v_src[ 5] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + //v_src[ 6] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x31); + //v_src[ 7] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x31); + + //v_src[ 8] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + //v_src[ 9] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + //v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x20); + //v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x20); + //v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + //v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); + //v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x31); + //v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x31); + + //__m256i v_madd_0[8][16]; + //__m256i v_madd_1[8][16]; + //for (int s = 0; s < 8; ++s) { + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_0[s][c] = _mm256_madd_epi16(v_src[0 + s], v_coeff); + // v_madd_1[s][c] = _mm256_madd_epi16(v_src[8 + s], v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_00[4][16]; + //__m256i v_add_01[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_00[d][c] = _mm256_add_epi32(v_madd_0[s + 0][c], v_madd_0[s + 1][c]); + // v_add_01[d][c] = _mm256_add_epi32(v_madd_1[s + 0][c], v_madd_1[s + 1][c]); + // } + //} + + //__m256i v_add_10[2][16]; + //__m256i v_add_11[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_10[d][c] = _mm256_add_epi32(v_add_00[s + 0][c], v_add_00[s + 1][c]); + // v_add_11[d][c] = _mm256_add_epi32(v_add_01[s + 0][c], v_add_01[s + 1][c]); + // } + //} + + //__m256i v_trunc_0[16]; + //__m256i v_trunc_1[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_0[c] = truncate_avx2(_mm256_add_epi32(v_add_10[0][c], v_add_10[1][c]), debias, shift); + // v_trunc_1[c] = truncate_avx2(_mm256_add_epi32(v_add_11[0][c], v_add_11[1][c]), debias, shift); + //} + + //__m256i v_result[16]; + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_packs_epi32(v_trunc_0[d], v_trunc_1[d]); + //} + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_permute4x64_epi64(v_result[d], _MM_SHUFFLE(3, 1, 2, 0)); + //} + + //transpose_avx2(v_result, (__m256i*)dst, 16, 16); +} + +static void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const 
int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + const int16_t* ver_coeff = fi_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x16_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x16_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x32_coeff_ver; + } + + int16_t v_hor_pass_out[32*16]; + fast_forward_DCT2_B16_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + + __m256i temp_out[32]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + if(ver == DCT2) { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } + else { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } +#if 0 + // To how many parts the vertical pass should be split. + // At least on my testing it seems that there is no further gain by splitting to more than 4 parts. 
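+  // With NUM_PARTS = 4, PART_DIMENSION = 32 / 4 = 8, so each part produces
+  // 8 of the 32 output rows and the v_madd working set stays at 16x8 vectors
+  // instead of 16x32 (presumably the win is lower register pressure and less
+  // spilling; the exact reason is not documented here).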
+#define NUM_PARTS 4
+#define PART_DIMENSION (32/NUM_PARTS)
+  for (int part = 0; part < NUM_PARTS; ++part) {
+    // Got 32 / NUM_PARTS lines of samples. Handle two lines at a time (because of unpack)
+    __m256i v_madd_hi[16][PART_DIMENSION];
+    __m256i v_madd_lo[16][PART_DIMENSION];
+    // Samples are the same between the parts
+    __m256i* v_src_ptr = (__m256i*)v_hor_pass_out; // Cast needed: v_hor_pass_out is an int16_t array
+    // However for coefficients, the starting point needs to be adjusted
+    const int32_t* line_coeff = (const int32_t*)ver_coeff + PART_DIMENSION * part;
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]);
+      __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]);
+
+      // Apply coefficients
+      // TODO: Here try loading the coefficient directly instead of set1
+      for (int ii = 0; ii < PART_DIMENSION; ++ii) {
+        const int32_t coeff = line_coeff[ii];
+        const __m256i v_coeff = _mm256_set1_epi32(coeff);
+        v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff);
+        v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff);
+      }
+
+      line_coeff += 32;
+      v_src_ptr += 2;
+    }
+
+    for (int ii = 0; ii < PART_DIMENSION; ++ii) {
+      // First round of additions
+      __m256i v_add_hi_0[8];
+      __m256i v_add_lo_0[8];
+      for (int i = 0; i < 8; ++i) {
+        const int offset = i * 2;
+        v_add_hi_0[i] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]);
+        v_add_lo_0[i] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]);
+      }
+
+      // Second round of additions
+      __m256i v_add_hi_1[4];
+      __m256i v_add_lo_1[4];
+      for (int i = 0; i < 4; ++i) {
+        const int offset = i * 2;
+        v_add_hi_1[i] = _mm256_add_epi32(v_add_hi_0[offset], v_add_hi_0[offset + 1]);
+        v_add_lo_1[i] = _mm256_add_epi32(v_add_lo_0[offset], v_add_lo_0[offset + 1]);
+      }
+
+      // Third round of additions
+      __m256i v_add_hi_2[2];
+      __m256i v_add_lo_2[2];
+      for (int i = 0; i < 2; ++i) {
+        const int offset = i * 2;
+        v_add_hi_2[i] = _mm256_add_epi32(v_add_hi_1[offset], v_add_hi_1[offset + 1]);
+        v_add_lo_2[i] = _mm256_add_epi32(v_add_lo_1[offset], v_add_lo_1[offset + 1]);
+      }
+
+      // Final round of additions, truncate and store
+      __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi_2[0], v_add_hi_2[1]), debias, shift_2nd);
+      __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo_2[0], v_add_lo_2[1]), debias, shift_2nd);
+      __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi);
+      _mm256_store_si256((__m256i*)dst, v_result);
+
+      dst += 16;
+    }
+  }
+#undef NUM_PARTS
+#undef PART_DIMENSION
+#endif
+
+}
+
+
+static void fast_inverse_tr_16x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vectors at a time
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_tmp16_lo[16];
+  __m256i v_tmp16_hi[16];
+  for (int d = 0, s = 0; d < 16; ++d, s += 2) {
+    v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 1]);
+    v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 1]);
+  }
+  int row = 0;
+  for (; row < 32 - skip_line2; ++row) {
+    __m256i v_res_lo = _mm256_setzero_si256();
+    __m256i v_res_hi = _mm256_setzero_si256();
+    for (int i = 0; i < 16; ++i) {
+      const __m256i v_coeff = _mm256_set1_epi32(*c_ptr);
+      __m256i v_madd_lo = _mm256_madd_epi16(v_tmp16_lo[i], v_coeff);
+      __m256i v_madd_hi = _mm256_madd_epi16(v_tmp16_hi[i], 
v_coeff); + c_ptr++; + + v_res_lo = _mm256_add_epi32(v_res_lo, v_madd_lo); + v_res_hi = _mm256_add_epi32(v_res_hi, v_madd_hi); + } + + __m256i v_trunc_lo = truncate_avx2(v_res_lo, debias, shift); + __m256i v_trunc_hi = truncate_avx2(v_res_hi, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + dst[row] = packed; + } + + for (; row < 32; ++row) { + dst[row] = _mm256_setzero_si256(); + } +} + +static void fast_inverse_tr_16x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + int32_t * src_32 = (int32_t *)src; + for (int row = 0, d = 0; row < 32; ++row) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i *coeff_start = (__m256i*) coeff; + for (int i = 0; i < 8; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + __m256i v_madd_1 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*) dst + row, packed); + } +} + +static void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x32_coeff_hor; + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, 0); + int16_t* ver_pass_out = (int16_t*)v_ver_pass_out; + fast_inverse_tr_16x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int reduced_line = line - skip_line; + + for(int j = 0; j < reduced_line; ++j) { + int32_t source[16]; + memcpy(source, src, sizeof(int16_t) * 32); + src += 32; + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t *coeff_start = coeff; + for(int i = 0; i < 16; i++) { + __m256i v_src = _mm256_set1_epi32(source[i]); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if(line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm256_store_si256(dst, v_trunc_0); + dst++; + _mm256_store_si256(dst, v_trunc_1); + dst++; + } +} + +static void fast_forward_DCT8_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int cutoff = 32 - skip_line2; + const int reduced_line = line - skip_line; + + ALIGNED(32) int16_t temp_source[32 * 32]; + __m256i* v_src_p = (__m256i*) src; + for (int i = 0; i < reduced_line / 2; ++i) { + __m256i first_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i first_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i second_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + __m256i second_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + + _mm256_store_si256((__m256i*)temp_source + i * 4, first_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 1, first_half_hi); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 2, second_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 3, second_half_hi); + } + + for (int j = 0; j < reduced_line / 2; ++j) { + + int32_t source[32]; + memcpy(source, temp_source + 64 * j, sizeof(int16_t) * 64); + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = coeff; + + for (int i = 0; i < 32; i += 2) { + __m256i v_src0 = _mm256_set1_epi32(source[i]); + __m256i v_src1 = _mm256_set1_epi32(source[i + 1]); + + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src0, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src0, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src1, v_coeff_0); + __m256i madd_3 = _mm256_madd_epi16(v_src1, v_coeff_1); + + res_0 = _mm256_add_epi32(madd_0, res_0); + res_1 = _mm256_add_epi32(madd_1, res_1); + res_2 = _mm256_add_epi32(madd_2, res_2); + res_3 = _mm256_add_epi32(madd_3, res_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if (line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + } + _mm256_store_si256(dst, v_trunc_0); + dst+=2; + _mm256_store_si256(dst, v_trunc_2); + dst+=2; + } +} + + +static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_ptr = src; + + // Prepare coeffs + // TODO: either rename these old coeff tables to be consistent with other new avx2 functions + // or construct them here in place. 
Should be easy to accomplish with set1_epi32, just use an int32_t combined from two int16_t + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]); + + // Got data for 4 vectors, 32 lines with 2 samples each + __m256i v_result_e[4]; + __m256i v_result_o[4]; + for (int j = 0; j < 4; ++j) { + const __m256i v_src = v_src_ptr[0]; + + v_result_e[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_0), debias, shift); + v_result_o[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_1), debias, shift); + + v_src_ptr++; + } + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_packs_epi32(v_result_e[0], v_result_e[1]); + v_tmp[1] = _mm256_packs_epi32(v_result_e[2], v_result_e[3]); + v_tmp[2] = _mm256_packs_epi32(v_result_o[0], v_result_o[1]); + v_tmp[3] = _mm256_packs_epi32(v_result_o[2], v_result_o[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_tmp[i]); + } +} + +static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Got data for 8 vectors, 32 lines with 4 samples each + + // Prepare coeffs + const int16_t* coeff = &uvg_g_dct_4[0][0]; + const int a = coeff[0]; + const int b = coeff[1 * 4 + 0]; + const int c = coeff[1 * 4 + 1]; + + __m256i v_coeff_0 = _mm256_set1_epi16(a); + __m256i v_coeff_1 = _mm256_setr_epi16(b, c, -c, -b, b, c, -c, -b, b, c, -c, -b, b, c, -c, -b); + __m256i v_coeff_2 = _mm256_setr_epi16(a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a); + __m256i v_coeff_3 = _mm256_setr_epi16(c, -b, b, -c, c, -b, b, -c, c, -b, b, -c, c, -b, b, -c); + + const __m256i* v_src_ptr = src; + __m256i v_trunc_0[8]; + __m256i v_trunc_1[8]; + for (int j = 0; j < 8; ++j) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_3); + + v_trunc_0[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + v_trunc_1[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + v_src_ptr++; + } + + __m256i v_result[8]; + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_trunc_0[i] = _mm256_permute4x64_epi64(v_trunc_0[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1[i] = _mm256_permute4x64_epi64(v_trunc_1[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + v_tmp[0] = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + v_tmp[1] = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + v_tmp[2] = _mm256_packs_epi32(v_trunc_0[4], v_trunc_0[5]); + v_tmp[3] = _mm256_packs_epi32(v_trunc_0[6], v_trunc_0[7]); + v_tmp[4] = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + v_tmp[5] = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + v_tmp[6] = _mm256_packs_epi32(v_trunc_1[4], v_trunc_1[5]); + v_tmp[7] = _mm256_packs_epi32(v_trunc_1[6], v_trunc_1[7]); + + v_result[0] =
_mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); + + v_result[4] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_result[i]); + } +} + + +static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int16_t* const p_dst = dst; + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Re-use coeff table + const __m256i* v_coeff = (const __m256i*)ff_dct2_16x8_coeff_ver; + + const int reduced_line = line - skip_line; + const __m256i* v_src_ptr = src; + __m256i v_tmp_result[16]; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int i = 0; i < 16; ++i) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + // __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange the source so that samples can be added together column-wise + // after the first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...]
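/* [Editor's note: illustrative sketch, not part of the patch.] The rearrangement
 * described above works because _mm256_madd_epi16 (vpmaddwd) multiplies adjacent
 * 16-bit pairs and sums each pair into one 32-bit lane: out[k] = a[2k]*b[2k] +
 * a[2k+1]*b[2k+1], with the products widened to 32 bits before the add. With the
 * samples duplicated as [s0 s1 s0 s1 ...], a single madd therefore yields a
 * two-tap partial sum in every 32-bit lane, and the add_epi32 steps that follow
 * complete the 8-tap column sums. A scalar model of one lane (hypothetical helper
 * name, kept inside this comment so the patch still compiles):
 *
 *   static int32_t madd_lane(const int16_t a[2], const int16_t b[2]) {
 *       return (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1];
 *   }
 *
 * One output column of this vertical pass is then madd_lane(s0..s1, c0..c1) +
 * madd_lane(s2..s3, c2..c3) + madd_lane(s4..s5, c4..c5) + madd_lane(s6..s7, c6..c7),
 * which is exactly v_madd_0..v_madd_3 followed by the two rounds of add_epi32 below.
 */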
+ + __m256i v_src_0 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + v_tmp_result[i] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + v_src_ptr++; + } + + __m256i v_result[16]; + transpose_avx2(v_tmp_result, v_result, 8, 32); + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + +} + + +static void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x2_coeff_ver; + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 source vectors, 2 lines 32 samples each + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[1]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[0]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[1]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[0]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + __m256i v_trunc_hi_00 = truncate_avx2(v_madd_hi_00, debias, shift_2nd); + __m256i v_trunc_hi_01 = truncate_avx2(v_madd_hi_01, debias, shift_2nd); + __m256i v_trunc_hi_10 = truncate_avx2(v_madd_hi_10, debias, shift_2nd); + __m256i v_trunc_hi_11 = truncate_avx2(v_madd_hi_11, debias, shift_2nd); + + __m256i v_trunc_lo_00 = truncate_avx2(v_madd_lo_00, debias, shift_2nd); + __m256i v_trunc_lo_01 = truncate_avx2(v_madd_lo_01, debias, shift_2nd); + __m256i v_trunc_lo_10 = truncate_avx2(v_madd_lo_10, debias, shift_2nd); + __m256i v_trunc_lo_11 = truncate_avx2(v_madd_lo_11, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_hi_00); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_hi_10); // Swap middle hi-lo lanes + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_01, v_trunc_hi_01); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_11, v_trunc_hi_11); + + // Swap middle 64-bit chunks + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_2 = _mm256_permute4x64_epi64(v_result_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_3 = _mm256_permute4x64_epi64(v_result_3, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_32x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i* v_src = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src[0], v_src[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src[1], v_src[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src[0], v_src[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src[1], v_src[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_0), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_1), debias, shift); + 
__m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_0), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_1), debias, shift); + + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_0), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_1), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_0), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_1), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_lo_11); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_32x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_permute4x64_epi64(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[i] = _mm256_add_epi32(v_add_00, v_add_01); + v_coeff += 4; + } + + __m256i v_hadd_0[16]; + for (int src = 0, dst = 0; dst < 16; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_hadd_1[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: cutoff for DCT8 and DST7 +} + +static void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = 0; // DST7 and DCT8 are not defined for this size. Therefore no skip width needed. 
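/* [Editor's note: illustrative sketch, not part of the patch.] All of these
 * kernels round with the same debias-and-shift rule that truncate_avx2 applies
 * per 32-bit lane: add 1 << (shift - 1), then shift right arithmetically. A
 * scalar model (hypothetical helper name, kept inside this comment):
 *
 *   static int32_t round_shift(int32_t sum, int32_t shift) {
 *       return (sum + (1 << (shift - 1))) >> shift;
 *   }
 *
 * e.g. round_shift(1000, 7) == (1000 + 64) >> 7 == 8. The inverse passes build
 * the debias as 1 << (shift - 1) unconditionally, while the forward passes still
 * guard with (shift > 0); hence the recurring ISP_TODO about removing that check.
 */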
+ int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x32_coeff_ver; // rename + // No DST7 and DCT8 tables needed. + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_32x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x4_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 8 vectors. 4 lines with 32 samples each. 
Need 2 vectors for each line + // Handle two lines at a time + __m256i v_madd_lo_even[2][4]; + __m256i v_madd_lo_odd[2][4]; + __m256i v_madd_hi_even[2][4]; + __m256i v_madd_hi_odd[2][4]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 2; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < 4; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 4; + v_src_ptr += 4; + } + + // Final add and truncate + __m256i v_trunc_lo_even[4]; + __m256i v_trunc_hi_even[4]; + __m256i v_trunc_lo_odd[4]; + __m256i v_trunc_hi_odd[4]; + for (int ii = 0; ii < 4; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_lo_even[0][ii], v_madd_lo_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_lo_odd[0][ii], v_madd_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_hi_even[0][ii], v_madd_hi_even[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_hi_odd[0][ii], v_madd_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < 4; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + +} + + +static void fast_inverse_tr_32x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_lo_1[2][4]; + __m256i v_madd_hi_0[2][4]; + __m256i v_madd_hi_1[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_lo_1[src][i] = _mm256_madd_epi16(v_src_lo_1, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + v_madd_hi_1[src][i] = _mm256_madd_epi16(v_src_hi_1, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[8]; + __m256i v_trunc_hi[8]; + for (int dst = 0, src = 0; src < 4; ++src, dst += 2) { + v_trunc_lo[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), 
debias, shift); + v_trunc_lo[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_lo_1[0][src], v_madd_lo_1[1][src]), debias, shift); + v_trunc_hi[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + v_trunc_hi[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_hi_1[0][src], v_madd_hi_1[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[2]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[2]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[4], v_trunc_lo[6]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[4], v_trunc_hi[6]); + + if(skip_line == 0) { + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_lo[3]); + dst[3] = _mm256_packs_epi32(v_trunc_hi[1], v_trunc_hi[3]); + dst[5] = _mm256_packs_epi32(v_trunc_lo[5], v_trunc_lo[7]); + dst[7] = _mm256_packs_epi32(v_trunc_hi[5], v_trunc_hi[7]); + } + else { + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + } + + // TODO: mts cutoff +} +static void fast_inverse_tr_32x4_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_hi_0[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int src = 0; src < 4; ++src) { + v_trunc_lo[src] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_hi[src] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[1]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[1]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_lo[3]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[2], v_trunc_hi[3]); + + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + v_src[4] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_src[6] = 
_mm256_permute2x128_si256(src[1], src[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + __m256i v_coeff_4 = _mm256_set1_epi64x(c_ptr[4]); + __m256i v_coeff_5 = _mm256_set1_epi64x(c_ptr[5]); + __m256i v_coeff_6 = _mm256_set1_epi64x(c_ptr[6]); + __m256i v_coeff_7 = _mm256_set1_epi64x(c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff_4); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff_5); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff_6); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff_7); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_add[i] = _mm256_add_epi32(v_add_10, v_add_11); + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + 
v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} +static void fast_inverse_tr_32x4_avx2_mts_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + v_add[i] = v_add_10; + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + 
__m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename + const int16_t* hor_coeff = &uvg_g_dct_32_t[0][0]; + if (hor == DST7) { + hor_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (hor == DCT8) { + hor_coeff = &uvg_g_dct8_32[0][0]; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[8]; + if(ver == DCT2) { + fast_inverse_tr_32x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + else { + fast_inverse_tr_32x4_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + + if(hor == DCT2) { + fast_inverse_tr_32x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } + else { + fast_inverse_tr_32x4_avx2_mts_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } +} + + +static void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x8_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same partitioning as for the other 32-wide transforms where the other dimension is 8 or 16. + // However, 1, 2 and 4 parts all seem to produce similar results; increasing the value + // just shifts the pressure from one point to another. +#define NUM_PARTS 4 +#define PART_DIMENSION (8 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got data for 16 vectors, 8 lines 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[4][PART_DIMENSION]; + __m256i v_madd_lo_odd[4][PART_DIMENSION]; + __m256i v_madd_hi_even[4][PART_DIMENSION]; + __m256i v_madd_hi_odd[4][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const __m256i* v_coeff = (const __m256i*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 4; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 8; + v_src_ptr += 4; + } + + // First round of additions + __m256i v_add_lo_even[2][PART_DIMENSION]; + __m256i v_add_hi_even[2][PART_DIMENSION]; + __m256i v_add_lo_odd[2][PART_DIMENSION]; + __m256i v_add_hi_odd[2][PART_DIMENSION]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_add_lo_even[i][ii] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even[i][ii] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd[i][ii] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd[i][ii] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + } + + // Final add and truncate + __m256i v_trunc_lo_even[PART_DIMENSION]; + __m256i v_trunc_hi_even[PART_DIMENSION]; + __m256i v_trunc_lo_odd[PART_DIMENSION]; + __m256i v_trunc_hi_odd[PART_DIMENSION]; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_even[0][ii], v_add_lo_even[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_even[0][ii], v_add_hi_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_odd[0][ii], v_add_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_odd[0][ii], v_add_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16),
v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_32x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[8]; + __m256i v_src_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_lo[d + 1] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 1] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_lo_1[d] = _mm256_madd_epi16(v_src_lo[s + 1], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + v_madd_hi_1[d] = _mm256_madd_epi16(v_src_hi[s + 1], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + __m256i v_add_lo_10 = _mm256_add_epi32(v_madd_lo_1[0], v_madd_lo_1[1]); + __m256i v_add_lo_11 = _mm256_add_epi32(v_madd_lo_1[2], v_madd_lo_1[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + __m256i v_add_hi_10 = _mm256_add_epi32(v_madd_hi_1[0], v_madd_hi_1[1]); + __m256i v_add_hi_11 = _mm256_add_epi32(v_madd_hi_1[2], v_madd_hi_1[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + __m256i v_trunc_lo_1 = truncate_avx2(_mm256_add_epi32(v_add_lo_10, v_add_lo_11), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + __m256i v_trunc_hi_1 = truncate_avx2(_mm256_add_epi32(v_add_hi_10, v_add_hi_11), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[4]; + __m256i v_src_hi[4]; + for (int d = 0, s = 0; d < 4; d += 1, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_hi_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 1) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + + __m256i 
v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_setzero_si256(); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = skip_line2 == 16 ? 8 : 16; + + int32_t *src_32 = (int32_t*)src; + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < limit; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_coeff0 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff2 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff3 = _mm256_loadu_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i madd1 = _mm256_madd_epi16(v_src, v_coeff1); + __m256i madd2 = _mm256_madd_epi16(v_src, v_coeff2); + __m256i madd3 = _mm256_madd_epi16(v_src, v_coeff3); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + res_2 = _mm256_add_epi32(res_2, madd2); + res_3 = _mm256_add_epi32(res_3, madd3); + } + src_32 += limit == 8 ? 8 : 0; + + __m256i v_trunk0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunk1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunk2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunk3 = truncate_avx2(res_3, debias, shift); + + __m256i packed0 = _mm256_packs_epi32(v_trunk0, v_trunk1); + __m256i packed1 = _mm256_packs_epi32(v_trunk2, v_trunk3); + + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, packed0); + _mm256_store_si256((__m256i*)dst + 1, packed1); + dst += 32; + } + + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x32_coeff_hor; // TODO: rename this table + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[16]; + if(ver == DCT2 || hor == DCT2) { + fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_inverse_tr_32x8_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x16_coeff_ver; + } + + __m256i v_hor_pass_out[32]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same as for 8x32 and 16x32, 4 parts is optimal +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got samples for 32 vectors, 16 lines with 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[8][PART_DIMENSION]; + __m256i v_madd_lo_odd[8][PART_DIMENSION]; + __m256i v_madd_hi_even[8][PART_DIMENSION]; + __m256i v_madd_hi_odd[8][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const int32_t* line_coeff = (const int32_t*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 8; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff); + } + + line_coeff += 16; + v_src_ptr += 4; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_lo_even_0[4]; + __m256i v_add_hi_even_0[4]; + __m256i v_add_lo_odd_0[4]; + __m256i v_add_hi_odd_0[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_lo_even_0[i] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even_0[i] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd_0[i] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd_0[i] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_lo_even_1[2]; + __m256i v_add_hi_even_1[2]; + __m256i v_add_lo_odd_1[2]; + __m256i v_add_hi_odd_1[2]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + v_add_lo_even_1[i] = _mm256_add_epi32(v_add_lo_even_0[offset], v_add_lo_even_0[offset + 1]); + v_add_hi_even_1[i] = _mm256_add_epi32(v_add_hi_even_0[offset], v_add_hi_even_0[offset + 1]); + v_add_lo_odd_1[i] = _mm256_add_epi32(v_add_lo_odd_0[offset], v_add_lo_odd_0[offset + 1]); + v_add_hi_odd_1[i] = _mm256_add_epi32(v_add_hi_odd_0[offset], v_add_hi_odd_0[offset + 1]); + } + + // Final add and truncate + __m256i v_trunc_lo_even; + __m256i v_trunc_hi_even; + __m256i v_trunc_lo_odd; + __m256i v_trunc_hi_odd; + v_trunc_lo_even = truncate_avx2(_mm256_add_epi32(v_add_lo_even_1[0], v_add_lo_even_1[1]), debias, shift_2nd); + v_trunc_hi_even = truncate_avx2(_mm256_add_epi32(v_add_hi_even_1[0], v_add_hi_even_1[1]), debias, shift_2nd); + v_trunc_lo_odd = truncate_avx2(_mm256_add_epi32(v_add_lo_odd_1[0], v_add_lo_odd_1[1]), debias, shift_2nd); + v_trunc_hi_odd = truncate_avx2(_mm256_add_epi32(v_add_hi_odd_1[0], v_add_hi_odd_1[1]), debias, shift_2nd); + + + // Permute and store + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even, v_trunc_hi_even); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd, v_trunc_hi_odd); + // Flip the middle 64 bit chunks + 
v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +} + + +static void fast_inverse_tr_32x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = 32 - skip_line; + __m256i temp[32]; + for (int j = 0; j < limit; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 64]; + source[1] = src[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + temp[j] = packed; + } + for (int j = limit; j < 32; ++j) { + temp[j] = _mm256_setzero_si256(); + } + transpose_avx2(temp, dst, 16, 32); +} + +static void fast_inverse_tr_32x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + // Do a 32-bit transpose to arrange result from previous pass + __m256i v_tmp32_lo_e[8]; + __m256i v_tmp32_hi_e[8]; + __m256i v_tmp32_lo_o[8]; + __m256i v_tmp32_hi_o[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 4) { + v_tmp32_lo_e[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_hi_e[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_lo_o[d] = _mm256_unpacklo_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + v_tmp32_hi_o[d] = _mm256_unpackhi_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + __m256i v_tmp64_lo_e[8]; + __m256i v_tmp64_hi_e[8]; + __m256i v_tmp64_lo_o[8]; + __m256i v_tmp64_hi_o[8]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_tmp64_lo_e[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_lo_e[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_hi_e[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_hi_e[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_lo_o[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_lo_o[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + + v_tmp64_hi_o[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_hi_o[4 + d] = 
_mm256_unpackhi_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + } + + __m256i v_src[32]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x20); + + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x31); + + v_src[8] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x20); + v_src[9] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x20); + v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x20); + v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x20); + + v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x31); + v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x31); + v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x31); + v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x31); + + v_src[16] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x20); + v_src[17] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x20); + v_src[18] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x20); + v_src[19] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x20); + + v_src[20] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x31); + v_src[21] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x31); + v_src[22] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x31); + v_src[23] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x31); + + v_src[24] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x20); + v_src[25] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x20); + v_src[26] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x20); + v_src[27] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x20); + + v_src[28] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x31); + v_src[29] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x31); + v_src[30] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x31); + v_src[31] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x31); + + __m256i v_trunc[64]; + __m256i* v_src_ptr = v_src; + __m256i* v_tr_ptr = v_trunc; + + + for (int chunk = 0; chunk < 2; ++chunk) { + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + for (int c = 0; c < 32; ++c) { + __m256i v_madd[16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src_ptr[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_1[d] = _mm256_add_epi32(v_add_0[s + 0], v_add_0[s + 1]); + } + + __m256i v_add_2[2]; + for (int d = 0, s = 0; d < 2; ++d, s += 2) { + v_add_2[d] = _mm256_add_epi32(v_add_1[s + 0], 
v_add_1[s + 1]); + } + + v_tr_ptr[c] = truncate_avx2(_mm256_add_epi32(v_add_2[0], v_add_2[1]), debias, shift); + } + v_tr_ptr += 32; + v_src_ptr += 16; + } + + __m256i v_tmp[32]; + __m256i v_result[32]; + for (int i = 0, s = 0; i < 32; ++i, s += 2) { + v_tmp[i] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo[16]; + __m256i v_rtmp32_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_rtmp32_lo[d] = _mm256_unpacklo_epi32(v_tmp[s + 0], v_tmp[s + 1]); + v_rtmp32_hi[d] = _mm256_unpackhi_epi32(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_rtmp64_lo[16]; + __m256i v_rtmp64_hi[16]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_rtmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_lo[8 + d] = _mm256_unpacklo_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + + v_rtmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_hi[8 + d] = _mm256_unpackhi_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + } + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x20); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x20); + + v_result[8] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x31); + v_result[9] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x31); + v_result[10] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x31); + v_result[11] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x31); + + v_result[12] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x31); + v_result[13] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x31); + v_result[14] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x31); + v_result[15] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x31); + + v_result[16] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x20); + v_result[17] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x20); + v_result[18] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x20); + v_result[19] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x20); + + v_result[20] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x20); + v_result[21] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x20); + v_result[22] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x20); + v_result[23] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x20); + + v_result[24] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x31); + v_result[25] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x31); + v_result[26] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x31); + v_result[27] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x31); + + v_result[28] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x31); + v_result[29] = _mm256_permute2x128_si256(v_rtmp64_lo[14], 
v_rtmp64_lo[15], 0x31); + v_result[30] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x31); + v_result[31] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x31); + + for (int i = 0; i < 32; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: MTS cutoff +} + +static void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_32x16_coeff_ver; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_32x16_coeff_ver; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_32x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[32 * 32]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + __m256i temp_out[32 * 2]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize out the (shift > 0) check if shift is always > 0
+  const __m256i debias = _mm256_set1_epi32(add);
+  for (int j = 0; j < reduced_line; ++j) {
+    __m256i res_0 = _mm256_setzero_si256();
+    __m256i res_1 = _mm256_setzero_si256();
+    __m256i res_2 = _mm256_setzero_si256();
+    __m256i res_3 = _mm256_setzero_si256();
+    const int16_t* coeff_start = ver_coeff;
+    for (int i = 0; i < 16; ++i) {
+      int16_t source[2];
+      source[0] = v_hor_pass_out[j + i * 64];
+      source[1] = v_hor_pass_out[j + i * 64 + 32];
+      int32_t paired_source;
+      memcpy(&paired_source, source, sizeof(int32_t));
+
+      __m256i v_src = _mm256_set1_epi32(paired_source);
+      __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+      __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start);
+      __m256i v_coeff_2;
+      __m256i v_coeff_3;
+      if (skip_height == 0) {
+        coeff_start += 16;
+        v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start);
+        coeff_start += 16;
+        v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start);
+        coeff_start += 16;
+      }
+      else {
+        coeff_start += 48;
+      }
+
+      __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0);
+      __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1);
+      __m256i madd_2;
+      __m256i madd_3;
+      if (skip_height == 0) {
+        madd_2 = _mm256_madd_epi16(v_src, v_coeff_2);
+        madd_3 = _mm256_madd_epi16(v_src, v_coeff_3);
+      }
+
+      res_0 = _mm256_add_epi32(res_0, madd_0);
+      res_1 = _mm256_add_epi32(res_1, madd_1);
+      if (skip_height == 0) {
+        res_2 = _mm256_add_epi32(res_2, madd_2);
+        res_3 = _mm256_add_epi32(res_3, madd_3);
+      }
+    }
+    __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd);
+    __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd);
+    __m256i v_trunc_2;
+    __m256i v_trunc_3;
+    if (skip_height == 0) {
+      v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd);
+      v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd);
+    }
+
+    v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+    v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_store_si256(temp_out + 2 * j, v_trunc_0);
+    if (skip_height == 0) {
+      v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3);
+      v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0));
+      _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_2);
+    }
+  }
+  transpose_avx2(temp_out, (__m256i*) dst, 32, 32);
+#if 0
+  // 8 is probably best, though difference to 16 is not that large
+#define NUM_PARTS 8
+#define PART_DIMENSION (32 / NUM_PARTS)
+  for (int part = 0; part < NUM_PARTS; ++part) {
+    const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast to 32 bit integer to read 2 coeffs at a time
+    const __m256i* v_src_ptr = v_hor_pass_out;
+
+    __m256i v_madd_lo_e[16][PART_DIMENSION];
+    __m256i v_madd_lo_o[16][PART_DIMENSION];
+    __m256i v_madd_hi_e[16][PART_DIMENSION];
+    __m256i v_madd_hi_o[16][PART_DIMENSION];
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_src_lo_e = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]);
+      __m256i v_src_lo_o = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]);
+      __m256i v_src_hi_e = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]);
+      __m256i v_src_hi_o = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]);
+
+      for (int c = 0; c < PART_DIMENSION; ++c) {
+        const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]);
+        v_madd_lo_e[i][c] = _mm256_madd_epi16(v_src_lo_e, v_coeff);
+        v_madd_lo_o[i][c] = _mm256_madd_epi16(v_src_lo_o, v_coeff);
+        v_madd_hi_e[i][c] = _mm256_madd_epi16(v_src_hi_e, v_coeff);
+        v_madd_hi_o[i][c] =
_mm256_madd_epi16(v_src_hi_o, v_coeff); + } + coeff_ptr += 32; + v_src_ptr += 4; + } + + for (int c = 0; c < PART_DIMENSION; ++c) { + __m256i v_add_lo_e0[8]; + __m256i v_add_lo_o0[8]; + __m256i v_add_hi_e0[8]; + __m256i v_add_hi_o0[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_add_lo_e0[dst] = _mm256_add_epi32(v_madd_lo_e[src + 0][c], v_madd_lo_e[src + 1][c]); + v_add_lo_o0[dst] = _mm256_add_epi32(v_madd_lo_o[src + 0][c], v_madd_lo_o[src + 1][c]); + v_add_hi_e0[dst] = _mm256_add_epi32(v_madd_hi_e[src + 0][c], v_madd_hi_e[src + 1][c]); + v_add_hi_o0[dst] = _mm256_add_epi32(v_madd_hi_o[src + 0][c], v_madd_hi_o[src + 1][c]); + } + + __m256i v_add_lo_e1[4]; + __m256i v_add_lo_o1[4]; + __m256i v_add_hi_e1[4]; + __m256i v_add_hi_o1[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_e1[dst] = _mm256_add_epi32(v_add_lo_e0[src + 0], v_add_lo_e0[src + 1]); + v_add_lo_o1[dst] = _mm256_add_epi32(v_add_lo_o0[src + 0], v_add_lo_o0[src + 1]); + v_add_hi_e1[dst] = _mm256_add_epi32(v_add_hi_e0[src + 0], v_add_hi_e0[src + 1]); + v_add_hi_o1[dst] = _mm256_add_epi32(v_add_hi_o0[src + 0], v_add_hi_o0[src + 1]); + } + + __m256i v_add_lo_e2[2]; + __m256i v_add_lo_o2[2]; + __m256i v_add_hi_e2[2]; + __m256i v_add_hi_o2[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_e2[dst] = _mm256_add_epi32(v_add_lo_e1[src + 0], v_add_lo_e1[src + 1]); + v_add_lo_o2[dst] = _mm256_add_epi32(v_add_lo_o1[src + 0], v_add_lo_o1[src + 1]); + v_add_hi_e2[dst] = _mm256_add_epi32(v_add_hi_e1[src + 0], v_add_hi_e1[src + 1]); + v_add_hi_o2[dst] = _mm256_add_epi32(v_add_hi_o1[src + 0], v_add_hi_o1[src + 1]); + } + + __m256i v_trunc_lo_e = truncate_avx2(_mm256_add_epi32(v_add_lo_e2[0], v_add_lo_e2[1]), debias, shift_2nd); + __m256i v_trunc_lo_o = truncate_avx2(_mm256_add_epi32(v_add_lo_o2[0], v_add_lo_o2[1]), debias, shift_2nd); + __m256i v_trunc_hi_e = truncate_avx2(_mm256_add_epi32(v_add_hi_e2[0], v_add_hi_e2[1]), debias, shift_2nd); + __m256i v_trunc_hi_o = truncate_avx2(_mm256_add_epi32(v_add_hi_o2[0], v_add_hi_o2[1]), debias, shift_2nd); + + __m256i v_result_e = _mm256_packs_epi32(v_trunc_lo_e, v_trunc_hi_e); + __m256i v_result_o = _mm256_packs_epi32(v_trunc_lo_o, v_trunc_hi_o); + + v_result_e = _mm256_permute4x64_epi64(v_result_e, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_o = _mm256_permute4x64_epi64(v_result_o, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_e); + dst += 16; + _mm256_store_si256((__m256i*)dst, v_result_o); + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + +} + + +static void fast_inverse_tr_32x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[16][4]; + for (int d = 0, s = 0; d < 16; ++d, s += 4) { + v_src[d][0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][1] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][2] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + v_src[d][3] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = 
_mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + if(skip_line == 0) { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i][2], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i][3], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + v_res_2 = _mm256_add_epi32(v_res_2, v_madd_2); + v_res_3 = _mm256_add_epi32(v_res_3, v_madd_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + else { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_setzero_si256(); + } + } +} + +static void fast_inverse_tr_32x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + + // Do a 32 bit transpose on input + __m256i v_tmp32_lo[32]; + __m256i v_tmp32_hi[32]; + for (int d = 0, s = 0; d < 32; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(src[s + 0], src[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(src[s + 1], src[s + 3]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(src[s + 0], src[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(src[s + 1], src[s + 3]); + } + + __m256i v_tmp64_lo[32]; + __m256i v_tmp64_hi[32]; + for (int i = 0; i < 32; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + } + + __m256i v_src[64]; + for (int d = 0, s = 0; d < 64; d += 16, s += 8) { + v_src[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x20); + v_src[d + 1] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x20); + v_src[d + 2] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x20); + v_src[d + 3] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x20); + + 
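+    // _mm256_permute2x128_si256 selects whole 128-bit lanes: immediate 0x20
+    // concatenates the low lanes of its two sources ([a.lo | b.lo]) and 0x31
+    // the high lanes ([a.hi | b.hi]). Together with the epi32/epi64 unpacks
+    // above, this completes the 32-bit transpose of the vertical-pass output.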
v_src[d + 4] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x31); + v_src[d + 5] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x31); + v_src[d + 6] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x31); + v_src[d + 7] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x31); + + v_src[d + 8] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x20); + v_src[d + 9] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x20); + v_src[d + 10] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x20); + v_src[d + 11] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x20); + + v_src[d + 12] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x31); + v_src[d + 13] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x31); + v_src[d + 14] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x31); + v_src[d + 15] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x31); + } + + __m256i v_tmp[64]; + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i + 0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i + 16], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i + 32], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i + 48], v_coeff); + + v_res_0 = _mm256_add_epi32(v_madd_0, v_res_0); + v_res_1 = _mm256_add_epi32(v_madd_1, v_res_1); + v_res_2 = _mm256_add_epi32(v_madd_2, v_res_2); + v_res_3 = _mm256_add_epi32(v_madd_3, v_res_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + + for (int i = 0; i < 64; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[64]; + transpose_avx2(v_tmp, v_result, 32, 32); + + for (int i = 0; i < 64; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0);
+
+  const int32_t shift_1st = INVERSE_SHIFT_1ST;
+  const int32_t shift_2nd = INVERSE_SHIFT_2ND;
+
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
+  const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor;
+  if (hor == DST7) {
+    hor_coeff = fi_dst7_32xN_coeff_hor;
+  } else if (hor == DCT8) {
+    hor_coeff = fi_dct8_32xN_coeff_hor;
+  }
+  if (ver == DST7) {
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
+  } else if (ver == DCT8) {
+    ver_coeff = &uvg_g_dct8_32[0][0];
+  }
+
+  __m256i v_ver_pass_out[64];
+  fast_inverse_tr_32x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height);
+
+  fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
+}
+
+
+static dct_full_pass* dct_function_table[6][6] = {
+  { NULL,                      NULL,                      fast_forward_tr_2x8_avx2,  fast_forward_tr_2x16_avx2,  fast_forward_tr_2x32_avx2,  NULL },
+  { NULL,                      fast_forward_tr_4x4_avx2,  fast_forward_tr_4x8_avx2,  fast_forward_tr_4x16_avx2,  fast_forward_tr_4x32_avx2,  NULL },
+  { fast_forward_tr_8x2_avx2,  fast_forward_tr_8x4_avx2,  fast_forward_tr_8x8_avx2,  fast_forward_tr_8x16_avx2,  fast_forward_tr_8x32_avx2,  NULL },
+  { fast_forward_tr_16x2_avx2, fast_forward_tr_16x4_avx2, fast_forward_tr_16x8_avx2, fast_forward_tr_16x16_avx2, fast_forward_tr_16x32_avx2, NULL },
+  { fast_forward_tr_32x2_avx2, fast_forward_tr_32x4_avx2, fast_forward_tr_32x8_avx2, fast_forward_tr_32x16_avx2, fast_forward_tr_32x32_avx2, NULL },
+  { NULL,                      NULL,                      NULL,                      NULL,                       NULL,                       NULL }
+};
+
+
+static dct_full_pass* idct_function_table[6][6] = {
+  { NULL,                      NULL,                      fast_inverse_tr_2x8_avx2,  fast_inverse_tr_2x16_avx2,  fast_inverse_tr_2x32_avx2,  NULL },
+  { NULL,                      fast_inverse_tr_4x4_avx2,  fast_inverse_tr_4x8_avx2,  fast_inverse_tr_4x16_avx2,  fast_inverse_tr_4x32_avx2,  NULL },
+  { fast_inverse_tr_8x2_avx2,  fast_inverse_tr_8x4_avx2,  fast_inverse_tr_8x8_avx2,  fast_inverse_tr_8x16_avx2,  fast_inverse_tr_8x32_avx2,  NULL },
+  { fast_inverse_tr_16x2_avx2, fast_inverse_tr_16x4_avx2, fast_inverse_tr_16x8_avx2, fast_inverse_tr_16x16_avx2, fast_inverse_tr_16x32_avx2, NULL },
+  { fast_inverse_tr_32x2_avx2, fast_inverse_tr_32x4_avx2, fast_inverse_tr_32x8_avx2, fast_inverse_tr_32x16_avx2, fast_inverse_tr_32x32_avx2, NULL },
+  { NULL,                      NULL,                      NULL,                      NULL,                       NULL,                       NULL },
+};
+
 extern void uvg_get_tr_type(
   int8_t width,
+  int8_t height,
   color_t color,
   const cu_info_t* tu,
   tr_type_t* hor_out,
   tr_type_t* ver_out,
-  const int8_t mts_idx);
+  const int8_t mts_type);
 
 static void mts_dct_avx2(
   const int8_t bitdepth,
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
 
-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
 
-  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx)
+  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && width == height)
   {
-    dct_func* dct_func = uvg_get_dct_func(width, color, tu->type);
+    dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type);
     dct_func(bitdepth, input, output);
   }
-  else
-  {
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
-
-    tr_func* dct = dct_table[log2_width_minus2];
-
-    dct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx);
+  else {
+    const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+    // Transforms with a length-1 dimension are handled separately, since their interface differs from the other full-pass functions
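+    // The remaining shapes dispatch below through dct_function_table indexed
+    // by log2 dimensions, e.g. a 16x8 block gives log2_width_minus1 = 3 and
+    // log2_height_minus1 = 2, selecting fast_forward_tr_16x8_avx2.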
+    if (height == 1) {
+      if (width == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0);
+      } else if (width == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0);
+      }
+    }
+    else if (width == 1) {
+      if (height == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0);
+      } else if (height == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0);
+      }
+    }
+    else {
+      dct_full_pass* dct_func = dct_function_table[log2_width_minus1][log2_height_minus1];
+      dct_func(input, output, type_hor, type_ver);
+    }
   }
 }
@@ -1620,27 +7999,45 @@ static void mts_idct_avx2(
   const int8_t bitdepth,
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
 
-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
 
-  if (type_hor == DCT2 && type_ver == DCT2)
+  if (type_hor == DCT2 && type_ver == DCT2 && width == height)
   {
-    dct_func* idct_func = uvg_get_idct_func(width, color, tu->type);
+    dct_func* idct_func = uvg_get_idct_func(width, height, color, tu->type);
     idct_func(bitdepth, input, output);
   }
-  else
-  {
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
-
-    tr_func* idct = idct_table[log2_width_minus2];
-
-    idct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx);
+  else {
+    const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+    // Transforms with a length-1 dimension can be handled with the existing forward functions
+    if (height == 1) {
+      if (width == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
+        _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
+      } else if (width == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
+      }
+    }
+    else if (width == 1) {
+      if (height == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ?
fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else { + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); + } } } @@ -1653,14 +8050,14 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + // success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h new file mode 100644 index 00000000..5d02b617 --- /dev/null +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -0,0 +1,4827 @@ +#ifndef DCT_AVX2_TABLES_H +#define DCT_AVX2_TABLES_H + +#include "global.h" + +// Shuffle tables for simple avx2 functions + +ALIGNED(32) static const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; +ALIGNED(32) static const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; + +ALIGNED(32) static const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 0, 0, 2, 2, 2, 2 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 6, 6, 6 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; + +ALIGNED(32) static const int32_t ff_dct2_b32_permute[8][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {5, 5, 5, 5, 5, 5, 5, 5}, + {6, 6, 6, 6, 6, 6, 6, 6}, + {7, 7, 7, 7, 7, 7, 7, 7}, +}; + + +// Coeff tables for simple avx2 functions + +ALIGNED(32) static const int16_t fast_forward_dct2_b2_coeff[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +}; + +static const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward + +// Coeff arrays for B4 +ALIGNED(32) static const int16_t fast_forward_dct2_b4_coeff[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83, +-36, -83, -36, -83, 
-36, -83, -36, -83, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b4_coeff[64] = { + 29, 55, 29, 55, 29, 55, 29, 55, 84, -29, 84, -29, 84, -29, 84, -29, + 74, 84, 74, 84, 74, 84, 74, 84, -74, 55, -74, 55, -74, 55, -74, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 55, -84, 55, -84, 55, -84, 55, -84, + 0, -74, 0, -74, 0, -74, 0, -74, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b4_coeff[64] = { + 84, 74, 84, 74, 84, 74, 84, 74, 55, -74, 55, -74, 55, -74, 55, -74, + 55, 29, 55, 29, 55, 29, 55, 29, -29, 84, -29, 84, -29, 84, -29, 84, + 74, 0, 74, 0, 74, 0, 74, 0, 29, -74, 29, -74, 29, -74, 29, -74, +-74, -74, -74, -74, -74, -74, -74, -74, 84, -55, 84, -55, 84, -55, 84, -55, +}; + +// Coeff arrays for inverse B4 +ALIGNED(32) static const int16_t fast_inverse_dct2_b4_coeff[64] = { + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b4_coeff[64] = { + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, +}; + +ALIGNED(32) static const int16_t fast_inverse_dct8_b4_coeff[64] = { + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, +}; + +// Coeff arrays for forward B8 +ALIGNED(32) static const int16_t fast_forward_dct2_b8_coeff[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b8_coeff[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b8_coeff[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, + 32, 17, -78, -46, 85, 71, -46, -85, 
32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +// Coeff arrays for inverse B8 +ALIGNED(32) static const int16_t fast_inverse_dct2_b8_coeff[128] = { + 64, 89, 64, 75, 64, 50, 64, 18, 64, 89, 64, 75, 64, 50, 64, 18, + 83, 75, 36, -18, -36, -89, -83, -50, 83, 75, 36, -18, -36, -89, -83, -50, + 64, 50, -64, -89, -64, 18, 64, 75, 64, 50, -64, -89, -64, 18, 64, 75, + 36, 18, -83, -50, 83, 75, -36, -89, 36, 18, -83, -50, 83, 75, -36, -89, + 64, -18, 64, -50, 64, -75, 64, -89, 64, -18, 64, -50, 64, -75, 64, -89, +-83, 50, -36, 89, 36, 18, 83, -75, -83, 50, -36, 89, 36, 18, 83, -75, + 64, -75, -64, -18, -64, 89, 64, -50, 64, -75, -64, -18, -64, 89, 64, -50, +-36, 89, 83, -75, -83, 50, 36, -18, -36, 89, 83, -75, -83, 50, 36, -18, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b8_coeff[128] = { + 17, 46, 32, 78, 46, 86, 60, 71, 17, 46, 32, 78, 46, 86, 60, 71, + 71, 85, 85, 46, 32, -60, -46, -78, 71, 85, 85, 46, 32, -60, -46, -78, + 86, 78, -17, -71, -85, -17, 32, 85, 86, 78, -17, -71, -85, -17, 32, 85, + 60, 32, -86, -60, 71, 78, -17, -86, 60, 32, -86, -60, 71, 78, -17, -86, + 71, 32, 78, -17, 85, -60, 86, -85, 71, 32, 78, -17, 85, -60, 86, -85, +-86, 17, -60, 86, 17, 32, 78, -71, -86, 17, -60, 86, 17, 32, 78, -71, + 78, -60, -46, -32, -71, 86, 60, -46, 78, -60, -46, -32, -71, 86, 60, -46, +-46, 85, 85, -71, -78, 46, 32, -17, -46, 85, 85, -71, -78, 46, 32, -17, +}; + +static const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. 
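+// Reusing the forward table works because the DCT-8 basis matrix is
+// symmetric (coeff[k][n] == coeff[n][k]), so the transpose applied by the
+// inverse pass is the matrix itself; DST-7 lacks this symmetry, which is why
+// it gets separate fast_inverse_* tables. A sketch of the property, assuming
+// the plain 8-point DCT-8 matrix uvg_g_dct8_8 from dct.c (illustrative only,
+// not compiled):
+#if 0
+#include <assert.h>
+static void check_dct8_b8_symmetry(void)
+{
+  for (int k = 0; k < 8; ++k)
+    for (int n = 0; n < 8; ++n)
+      assert(uvg_g_dct8_8[k][n] == uvg_g_dct8_8[n][k]);
+}
+#endif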
+ +// Coeff arrays for forward B16 +ALIGNED(32) static const int16_t fast_forward_dct2_b16_coeff[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b16_coeff[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b16_coeff[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, 
-55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +// Coeff arrays for inverse B16 +ALIGNED(32) static const int16_t fast_inverse_dct2_b16_coeff[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, -9, 64, -25, 64, -43, 64, -57, + 89, 87, 75, 57, 50, 9, 18, -43, -89, 25, -75, 70, -50, 90, -18, 80, + 83, 80, 36, 9, -36, -70, -83, -87, 83, -43, 36, -90, -36, -57, -83, 25, + 75, 70, -18, -43, -89, -87, -50, 9, -75, 57, 18, 80, 89, -25, 50, -90, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -70, -64, -43, -64, 87, 64, 9, + 50, 43, -89, -90, 18, 57, 75, 25, -50, 80, 89, -9, -18, -70, -75, 87, + 36, 25, -83, -70, 83, 90, -36, -80, 36, -87, -83, 57, 83, -9, -36, -43, + 18, 9, -50, -25, 75, 43, -89, -57, -18, 90, 50, -87, -75, 80, 89, -70, + 64, 57, 64, 43, 64, 25, 64, 9, 64, -70, 64, -80, 64, -87, 64, -90, + -18, -80, -50, -90, -75, -70, -89, -25, 18, 43, 50, -9, 75, -57, 89, -87, + -83, -25, -36, 57, 36, 90, 83, 43, -83, 87, -36, 70, 36, -9, 83, -80, + 50, 90, 89, 25, 18, -80, -75, -57, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -9, -64, -87, -64, 43, 64, 70, 64, -90, -64, 25, -64, 80, 64, -57, + -75, -87, -18, 70, 89, 9, -50, -80, 75, -25, 18, -57, -89, 90, 50, -43, + -36, 43, 83, 9, -83, -57, 36, 87, -36, 80, 83, -90, -83, 70, 36, -25, + 89, 70, -75, -80, 50, 87, -18, -90, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b16_coeff[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 68, 48, 73, 25, 77, 0, 81, -25, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, -81, -25, -88, 33, -77, 77, -48, 88, + 68, 77, 88, 77, 48, 0, -25, -77, 88, 0, 68, -77, 0, -77, -68, 0, + 85, 88, 55, 25, -48, -81, -87, -48, -88, 25, -17, 88, 77, 0, 68, -88, + 88, 87, -8, -40, -88, -68, 17, 73, 81, -48, -40, -62, -77, 77, 48, 25, + 81, 73, -68, -85, -25, 25, 88, 55, -68, 68, 81, 8, 0, -77, -81, 81, + 62, 48, -88, -81, 68, 88, -8, -68, 48, -81, -87, 48, 77, 0, -25, -48, + 33, 17, -62, -33, 81, 48, -88, -62, -25, 88, 55, -85, -77, 77, 88, -68, + 40, 88, 48, 88, 55, 81, 62, 68, 85, -48, 87, -68, 88, -81, 88, -88, // 8 + 62, -17, 25, -68, -17, -88, -55, -73, -8, 62, 33, 8, 68, -48, 87, -85, +-81, -77, -81, 0, -25, 77, 48, 77, -88, 77, -48, 77, 25, 0, 81, -77, + -8, 68, 81, 68, 62, -48, -40, -81, -33, -25, -88, 81, -25, 48, 73, -68, + 87, 33, -25, -88, -85, 8, 33, 85, 73, -88, -55, 17, -68, 81, 62, -55, +-48, -88, -48, 48, 88, 33, -25, -87, 68, -17, 25, -62, -88, 88, 48, -40, +-55, 25, 88, 25, -73, -68, 17, 88, -40, 81, 85, -88, -81, 68, 33, -25, + 85, 73, -68, -81, 40, 87, -8, -88, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +static const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; + +// Coeff arrays for forward B32 +ALIGNED(32) static const int16_t fast_forward_dct2_b32_coeff[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 
13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, 
-38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b32_coeff[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 
90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b32_coeff[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 
30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +// Coeff arrays for inverse B32 +ALIGNED(32) static const int16_t fast_inverse_dct2_b32_coeff[1024] = { + 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, // 0 + 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, + 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, + -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, + 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, // 8 
+-89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, +-89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, + 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, +-25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, // 16 + 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, + 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, +-43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, + 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, // 24 +-75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, +-75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, + 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, +-57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, // 32 + 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, + 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, +-70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, + 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, // 40 +-50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, +-50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, + 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, +-80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, // 48 + 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, + 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, +-87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, + 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, // 56 +-18, 54, 50, -31, -75, 4, 89, 
22, -89, -46, 75, 67, -50, -82, 18, 90, + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, +-18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, + 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, +-90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b32_coeff[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, // 0 + 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, + 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, + 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, +-21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, // 8 +-63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, +-89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, +-53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, + 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, // 16 + 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, + 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, + 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, + 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, // 24 +-89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, +-84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, + 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, +-42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, // 32 + 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, + 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, +-34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, + 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, // 40 +-77, -13, 74, 84, 17, -68, -87, -17, 
53, 85, 46, -66, -89, -21, 26, 86, + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, +-60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, + 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, +-74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, // 48 + 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, + 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, +-78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, + 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, // 56 +-30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, +-21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, + 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, +-89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + +static const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; + + +// Shuffle tables for advanced and optimized avx2 functions + +// Shuffle 16 bit samples inside lanes. Put each sample four spaces from each other adjacent to each other. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | XX +// Output [0 4 1 5 2 6 3 7 | XX +ALIGNED(32) static const int8_t shuffle_16b_0415[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Shuffle 16 bit samples inside lanes. Put each even indexed sample next to each other, then each odd sample. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | +// Output [0 2 4 6 1 3 5 7 | +ALIGNED(32) static const int8_t shuffle_16b_0246[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +// Permute 32 bit samples across lanes. Put each sample four spaces from each other adjacent to each other. 
+// _mm256_permutevar8x32_epi32
+// Input [0 1 2 3 | 4 5 6 7]
+// Output [0 4 1 5 | 2 6 3 7]
+ALIGNED(32) static const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+
+ static const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415;
+
+ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle1_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x8_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle2_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x16_ver_result_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x4_shuffle_hor[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x4_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_hor[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ static const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415;
+
+ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+};
+
+// TODO: remove duplicate tables. Rename with a more descriptive name.
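To make these index tables concrete, here is a minimal standalone sketch (not part of the patch) that feeds counting input through shuffle_16b_0415 and permute_32b_0415 with the corresponding AVX2 intrinsics. It assumes a C11 compiler with AVX2 enabled, and uses _Alignas(32) in place of the codebase's ALIGNED(32) macro; the table contents are copied verbatim from above.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Tables copied from the patch; _Alignas(32) stands in for ALIGNED(32).
static _Alignas(32) const int8_t shuffle_16b_0415[32] = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};
static _Alignas(32) const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

int main(void)
{
  _Alignas(32) int16_t s16[16];
  _Alignas(32) int32_t s32[8];
  for (int i = 0; i < 16; ++i) s16[i] = (int16_t)i; // lanes hold 0..7 | 8..15
  for (int i = 0; i < 8; ++i)  s32[i] = i;          // lanes hold 0..3 | 4..7

  // In-lane byte shuffle: 16-bit samples four positions apart become adjacent.
  __m256i a = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i *)s16),
                                  _mm256_load_si256((const __m256i *)shuffle_16b_0415));
  _mm256_store_si256((__m256i *)s16, a);
  for (int i = 0; i < 16; ++i) printf("%d ", s16[i]); // 0 4 1 5 2 6 3 7 8 12 9 13 10 14 11 15
  printf("\n");

  // Cross-lane dword permute: the same pairing, but spanning the lane boundary.
  __m256i b = _mm256_permutevar8x32_epi32(_mm256_load_si256((const __m256i *)s32),
                                          _mm256_load_si256((const __m256i *)permute_32b_0415));
  _mm256_store_si256((__m256i *)s32, b);
  for (int i = 0; i < 8; ++i) printf("%d ", s32[i]);  // 0 4 1 5 2 6 3 7
  printf("\n");
  return 0;
}

The point of keeping these patterns in static constant tables is that every transpose or interleave step in the transform passes compiles down to a single vpshufb or vpermd on a preloaded constant rather than scalar reordering. The remaining shuffle and coefficient tables below follow the same conventions.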
+ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, +}; + +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +// Arrange samples into butterfly formation +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +// Swap two middle 16-bit values in each 64-bit chunk +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) static const int8_t ff_dct2_16x32_reverse_64b_order[32] = { + 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, + 22, 23, 20, 21, 18, 19, 16, 17, 30, 31, 28, 29, 26, 27, 24, 25, +}; + +ALIGNED(32) static const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 +}; + +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_order[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_result[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +}; + + +// Coeff tables for advanced and optimized avx2 functions + +// 2xN +ALIGNED(32) static const int16_t ff_dct2_2xN_coeff_hor[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 +}; + +ALIGNED(32) static const int16_t ff_dct2_2x8_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, -18, -50, -75, -89, 89, 75, 50, 18, -18, -50, -75, -89, + 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, + 75, -18, -89, -50, 50, 89, 18, -75, 75, -18, -89, -50, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, -75, -18, 89, -50, 50, -89, 18, 75, -75, -18, 89, -50, + 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, + 18, -50, 75, -89, 89, -75, 50, -18, 18, -50, 75, -89, 89, -75, 50, -18 +}; + +ALIGNED(32) static +const int16_t ff_dst7_2x8_coeff_ver[128] = { + 17, 32, 46, 60, 71, 78, 85, 86, 17, 32, 46, 60, 71, 78, 85, 86, + 46, 78, 86, 71, 32, -17, -60, -85, 46, 78, 86, 71, 32, -17, -60, -85, + 71, 85, 32, -46, -86, -60, 17, 78, 71, 85, 32, -46, -86, -60, 17, 78, + 85, 46, -60, -78, 17, 86, 32, -71, 85, 46, -60, -78, 17, 86, 32, -71, + 86, -17, -85, 32, 78, -46, -71, 60, 86, -17, -85, 32, 78, -46, -71, 60, + 78, -71, -17, 85, -60, -32, 86, -46, 78, -71, -17, 85, -60, -32, 86, -46, + 60, -86, 71, -17, -46, 85, -78, 32, 60, -86, 71, -17, -46, 85, -78, 32, + 32, -60, 78, -86, 85, -71, 46, -17, 32, -60, 78, -86, 85, -71, 46, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_2x8_coeff_ver[128] = { 
+ 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18, + 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50, +-64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89, + 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75, +-64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_2x8_coeff_ver[128] = { + 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46, + 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60, + 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78, +-85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86, + 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86, + 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71, + 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71, +-71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_2x8_coeff_ver[128] = { + 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32, + 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46, + 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17, +-46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85, + 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60, + 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78, + 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85, +-17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32, +}; + + + +ALIGNED(32) static const int16_t fi_dct2_2x16_coeff_ver[512] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, -64, -80, -89, -90, -64, -80, -89, -90, + 36, 9, -18, -43, 36, 9, -18, -43, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, -64, -25, 18, 57, -64, -25, 18, 57, +-36, -70, -89, -87, -36, -70, -89, -87, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 90, 75, 25, 64, 90, 75, 25, +-83, -87, -50, 9, -83, -87, -50, 9, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, -9, -75, -87, 64, -9, -75, -87, // 8 +-83, -25, 50, 90, -83, -25, 50, 90, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, -64, -87, -18, 70, -64, -87, -18, 70, +-36, 57, 89, 25, -36, 57, 89, 25, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, -64, 43, 89, 9, -64, 43, 89, 9, + 36, 90, 18, -80, 36, 90, 18, -80, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 70, -50, -80, 64, 70, -50, -80, + 83, 43, -75, -57, 83, 43, -75, -57, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -70, -50, 80, 64, -70, -50, 80, // 16 + 83, -43, -75, 57, 83, -43, -75, 57, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, -64, -43, 89, -9, -64, -43, 89, -9, + 36, -90, 18, 80, 36, -90, 18, 80, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, -64, 87, -18, -70, -64, 87, -18, -70, +-36, -57, 89, -25, -36, -57, 89, -25, 83, -9, -75, 80, 83, -9, 
-75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, 9, -75, 87, 64, 9, -75, 87, +-83, 25, 50, -90, -83, 25, 50, -90, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -90, 75, -25, 64, -90, 75, -25, // 24 +-83, 87, -50, -9, -83, 87, -50, -9, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, -64, 25, 18, -57, -64, 25, 18, -57, +-36, 70, -89, 87, -36, 70, -89, 87, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, -64, 80, -89, 90, -64, 80, -89, 90, + 36, -9, -18, 43, 36, -9, -18, 43, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -57, 50, -43, 64, -57, 50, -43, + 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_2x16_coeff_ver[512] = { + 8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85, + 88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25, + 48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55, +-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8 +-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48, +-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33, +-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87, + 48, 77, -40, -81, 48, 77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16 + 88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8, + 68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77, + 0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81, +-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24 +-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62, +-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88, + 25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40, + 81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct2_2x32_coeff_ver[2048] = { + 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 + 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, + 64, 61, 57, 54, 50, 46, 43, 38, 64, 61, 57, 54, 50, 46, 43, 38, + 36, 31, 25, 22, 18, 13, 9, 4, 36, 31, 25, 22, 18, 13, 9, 4, + 64, 90, 87, 82, 75, 67, 57, 46, 64, 90, 87, 82, 75, 67, 57, 46, + 36, 22, 9, -4, -18, -31, -43, 
-54, 36, 22, 9, -4, -18, -31, -43, -54, +-64, -73, -80, -85, -89, -90, -90, -88, -64, -73, -80, -85, -89, -90, -90, -88, +-83, -78, -70, -61, -50, -38, -25, -13, -83, -78, -70, -61, -50, -38, -25, -13, + 64, 88, 80, 67, 50, 31, 9, -13, 64, 88, 80, 67, 50, 31, 9, -13, // 8 +-36, -54, -70, -82, -89, -90, -87, -78, -36, -54, -70, -82, -89, -90, -87, -78, +-64, -46, -25, -4, 18, 38, 57, 73, -64, -46, -25, -4, 18, 38, 57, 73, + 83, 90, 90, 85, 75, 61, 43, 22, 83, 90, 90, 85, 75, 61, 43, 22, + 64, 85, 70, 46, 18, -13, -43, -67, 64, 85, 70, 46, 18, -13, -43, -67, +-83, -90, -87, -73, -50, -22, 9, 38, -83, -90, -87, -73, -50, -22, 9, 38, + 64, 82, 90, 88, 75, 54, 25, -4, 64, 82, 90, 88, 75, 54, 25, -4, +-36, -61, -80, -90, -89, -78, -57, -31, -36, -61, -80, -90, -89, -78, -57, -31, + 64, 82, 57, 22, -18, -54, -80, -90, 64, 82, 57, 22, -18, -54, -80, -90, // 16 +-83, -61, -25, 13, 50, 78, 90, 85, -83, -61, -25, 13, 50, 78, 90, 85, + 64, 31, -9, -46, -75, -90, -87, -67, 64, 31, -9, -46, -75, -90, -87, -67, +-36, 4, 43, 73, 89, 88, 70, 38, -36, 4, 43, 73, 89, 88, 70, 38, + 64, 78, 43, -4, -50, -82, -90, -73, 64, 78, 43, -4, -50, -82, -90, -73, +-36, 13, 57, 85, 89, 67, 25, -22, -36, 13, 57, 85, 89, 67, 25, -22, +-64, -88, -87, -61, -18, 31, 70, 90, -64, -88, -87, -61, -18, 31, 70, 90, + 83, 54, 9, -38, -75, -90, -80, -46, 83, 54, 9, -38, -75, -90, -80, -46, + 64, 73, 25, -31, -75, -90, -70, -22, 64, 73, 25, -31, -75, -90, -70, -22, // 24 + 36, 78, 90, 67, 18, -38, -80, -90, 36, 78, 90, 67, 18, -38, -80, -90, +-64, -13, 43, 82, 89, 61, 9, -46, -64, -13, 43, 82, 89, 61, 9, -46, +-83, -88, -57, -4, 50, 85, 87, 54, -83, -88, -57, -4, 50, 85, 87, 54, + 64, 67, 9, -54, -89, -78, -25, 38, 64, 67, 9, -54, -89, -78, -25, 38, + 83, 85, 43, -22, -75, -90, -57, 4, 83, 85, 43, -22, -75, -90, -57, 4, + 64, 90, 70, 13, -50, -88, -80, -31, 64, 90, 70, 13, -50, -88, -80, -31, + 36, 82, 87, 46, -18, -73, -90, -61, 36, 82, 87, 46, -18, -73, -90, -61, + 64, 61, -9, -73, -89, -46, 25, 82, 64, 61, -9, -73, -89, -46, 25, 82, // 32 + 83, 31, -43, -88, -75, -13, 57, 90, 83, 31, -43, -88, -75, -13, 57, 90, + 64, -4, -70, -90, -50, 22, 80, 85, 64, -4, -70, -90, -50, 22, 80, 85, + 36, -38, -87, -78, -18, 54, 90, 67, 36, -38, -87, -78, -18, 54, 90, 67, + 64, 54, -25, -85, -75, -4, 70, 88, 64, 54, -25, -85, -75, -4, 70, 88, + 36, -46, -90, -61, 18, 82, 80, 13, 36, -46, -90, -61, 18, 82, 80, 13, +-64, -90, -43, 38, 89, 67, -9, -78, -64, -90, -43, 38, 89, 67, -9, -78, +-83, -22, 57, 90, 50, -31, -87, -73, -83, -22, 57, 90, 50, -31, -87, -73, + 64, 46, -43, -90, -50, 38, 90, 54, 64, 46, -43, -90, -50, 38, 90, 54, // 40 +-36, -90, -57, 31, 89, 61, -25, -88, -36, -90, -57, 31, 89, 61, -25, -88, +-64, 22, 87, 67, -18, -85, -70, 13, -64, 22, 87, 67, -18, -85, -70, 13, + 83, 73, -9, -82, -75, 4, 80, 78, 83, 73, -9, -82, -75, 4, 80, 78, + 64, 38, -57, -88, -18, 73, 80, -4, 64, 38, -57, -88, -18, 73, 80, -4, +-83, -67, 25, 90, 50, -46, -90, -31, -83, -67, 25, 90, 50, -46, -90, -31, + 64, 85, 9, -78, -75, 13, 87, 61, 64, 85, 9, -78, -75, 13, 87, 61, +-36, -90, -43, 54, 89, 22, -70, -82, -36, -90, -43, 54, 89, 22, -70, -82, + 64, 31, -70, -78, 18, 90, 43, -61, 64, 31, -70, -78, 18, 90, 43, -61, // 48 +-83, 4, 87, 54, -50, -88, -9, 82, -83, 4, 87, 54, -50, -88, -9, 82, + 64, -38, -90, -22, 75, 73, -25, -90, 64, -38, -90, -22, 75, 73, -25, -90, +-36, 67, 80, -13, -89, -46, 57, 85, -36, 67, 80, -13, -89, -46, 57, 85, + 64, 22, -80, -61, 50, 85, -9, -90, 64, 22, -80, -61, 50, 85, -9, -90, +-36, 73, 70, -38, -89, -4, 87, 46, -36, 73, 70, -38, 
-89, -4, 87, 46, +-64, -78, 25, 90, 18, -82, -57, 54, -64, -78, 25, 90, 18, -82, -57, 54, + 83, -13, -90, -31, 75, 67, -43, -88, 83, -13, -90, -31, 75, 67, -43, -88, + 64, 13, -87, -38, 75, 61, -57, -78, 64, 13, -87, -38, 75, 61, -57, -78, // 56 + 36, 88, -9, -90, -18, 85, 43, -73, 36, 88, -9, -90, -18, 85, 43, -73, +-64, 54, 80, -31, -89, 4, 90, 22, -64, 54, 80, -31, -89, 4, 90, 22, +-83, -46, 70, 67, -50, -82, 25, 90, -83, -46, 70, 67, -50, -82, 25, 90, + 64, 4, -90, -13, 89, 22, -87, -31, 64, 4, -90, -13, 89, 22, -87, -31, + 83, 38, -80, -46, 75, 54, -70, -61, 83, 38, -80, -46, 75, 54, -70, -61, + 64, 67, -57, -73, 50, 78, -43, -82, 64, 67, -57, -73, 50, 78, -43, -82, + 36, 85, -25, -88, 18, 90, -9, -90, 36, 85, -25, -88, 18, 90, -9, -90, + 64, -4, -90, 13, 89, -22, -87, 31, 64, -4, -90, 13, 89, -22, -87, 31, // 64 + 83, -38, -80, 46, 75, -54, -70, 61, 83, -38, -80, 46, 75, -54, -70, 61, + 64, -67, -57, 73, 50, -78, -43, 82, 64, -67, -57, 73, 50, -78, -43, 82, + 36, -85, -25, 88, 18, -90, -9, 90, 36, -85, -25, 88, 18, -90, -9, 90, + 64, -13, -87, 38, 75, -61, -57, 78, 64, -13, -87, 38, 75, -61, -57, 78, + 36, -88, -9, 90, -18, -85, 43, 73, 36, -88, -9, 90, -18, -85, 43, 73, +-64, -54, 80, 31, -89, -4, 90, -22, -64, -54, 80, 31, -89, -4, 90, -22, +-83, 46, 70, -67, -50, 82, 25, -90, -83, 46, 70, -67, -50, 82, 25, -90, + 64, -22, -80, 61, 50, -85, -9, 90, 64, -22, -80, 61, 50, -85, -9, 90, // 72 +-36, -73, 70, 38, -89, 4, 87, -46, -36, -73, 70, 38, -89, 4, 87, -46, +-64, 78, 25, -90, 18, 82, -57, -54, -64, 78, 25, -90, 18, 82, -57, -54, + 83, 13, -90, 31, 75, -67, -43, 88, 83, 13, -90, 31, 75, -67, -43, 88, + 64, -31, -70, 78, 18, -90, 43, 61, 64, -31, -70, 78, 18, -90, 43, 61, +-83, -4, 87, -54, -50, 88, -9, -82, -83, -4, 87, -54, -50, 88, -9, -82, + 64, 38, -90, 22, 75, -73, -25, 90, 64, 38, -90, 22, 75, -73, -25, 90, +-36, -67, 80, 13, -89, 46, 57, -85, -36, -67, 80, 13, -89, 46, 57, -85, + 64, -38, -57, 88, -18, -73, 80, 4, 64, -38, -57, 88, -18, -73, 80, 4, // 80 +-83, 67, 25, -90, 50, 46, -90, 31, -83, 67, 25, -90, 50, 46, -90, 31, + 64, -85, 9, 78, -75, -13, 87, -61, 64, -85, 9, 78, -75, -13, 87, -61, +-36, 90, -43, -54, 89, -22, -70, 82, -36, 90, -43, -54, 89, -22, -70, 82, + 64, -46, -43, 90, -50, -38, 90, -54, 64, -46, -43, 90, -50, -38, 90, -54, +-36, 90, -57, -31, 89, -61, -25, 88, -36, 90, -57, -31, 89, -61, -25, 88, +-64, -22, 87, -67, -18, 85, -70, -13, -64, -22, 87, -67, -18, 85, -70, -13, + 83, -73, -9, 82, -75, -4, 80, -78, 83, -73, -9, 82, -75, -4, 80, -78, + 64, -54, -25, 85, -75, 4, 70, -88, 64, -54, -25, 85, -75, 4, 70, -88, // 88 + 36, 46, -90, 61, 18, -82, 80, -13, 36, 46, -90, 61, 18, -82, 80, -13, +-64, 90, -43, -38, 89, -67, -9, 78, -64, 90, -43, -38, 89, -67, -9, 78, +-83, 22, 57, -90, 50, 31, -87, 73, -83, 22, 57, -90, 50, 31, -87, 73, + 64, -61, -9, 73, -89, 46, 25, -82, 64, -61, -9, 73, -89, 46, 25, -82, + 83, -31, -43, 88, -75, 13, 57, -90, 83, -31, -43, 88, -75, 13, 57, -90, + 64, 4, -70, 90, -50, -22, 80, -85, 64, 4, -70, 90, -50, -22, 80, -85, + 36, 38, -87, 78, -18, -54, 90, -67, 36, 38, -87, 78, -18, -54, 90, -67, + 64, -67, 9, 54, -89, 78, -25, -38, 64, -67, 9, 54, -89, 78, -25, -38, // 96 + 83, -85, 43, 22, -75, 90, -57, -4, 83, -85, 43, 22, -75, 90, -57, -4, + 64, -90, 70, -13, -50, 88, -80, 31, 64, -90, 70, -13, -50, 88, -80, 31, + 36, -82, 87, -46, -18, 73, -90, 61, 36, -82, 87, -46, -18, 73, -90, 61, + 64, -73, 25, 31, -75, 90, -70, 22, 64, -73, 25, 31, -75, 90, -70, 22, + 36, -78, 90, -67, 18, 38, -80, 90, 36, -78, 90, -67, 18, 38, -80, 90, 
+-64, 13, 43, -82, 89, -61, 9, 46, -64, 13, 43, -82, 89, -61, 9, 46, +-83, 88, -57, 4, 50, -85, 87, -54, -83, 88, -57, 4, 50, -85, 87, -54, + 64, -78, 43, 4, -50, 82, -90, 73, 64, -78, 43, 4, -50, 82, -90, 73, // 104 +-36, -13, 57, -85, 89, -67, 25, 22, -36, -13, 57, -85, 89, -67, 25, 22, +-64, 88, -87, 61, -18, -31, 70, -90, -64, 88, -87, 61, -18, -31, 70, -90, + 83, -54, 9, 38, -75, 90, -80, 46, 83, -54, 9, 38, -75, 90, -80, 46, + 64, -82, 57, -22, -18, 54, -80, 90, 64, -82, 57, -22, -18, 54, -80, 90, +-83, 61, -25, -13, 50, -78, 90, -85, -83, 61, -25, -13, 50, -78, 90, -85, + 64, -31, -9, 46, -75, 90, -87, 67, 64, -31, -9, 46, -75, 90, -87, 67, +-36, -4, 43, -73, 89, -88, 70, -38, -36, -4, 43, -73, 89, -88, 70, -38, + 64, -85, 70, -46, 18, 13, -43, 67, 64, -85, 70, -46, 18, 13, -43, 67, // 112 +-83, 90, -87, 73, -50, 22, 9, -38, -83, 90, -87, 73, -50, 22, 9, -38, + 64, -82, 90, -88, 75, -54, 25, 4, 64, -82, 90, -88, 75, -54, 25, 4, +-36, 61, -80, 90, -89, 78, -57, 31, -36, 61, -80, 90, -89, 78, -57, 31, + 64, -88, 80, -67, 50, -31, 9, 13, 64, -88, 80, -67, 50, -31, 9, 13, +-36, 54, -70, 82, -89, 90, -87, 78, -36, 54, -70, 82, -89, 90, -87, 78, +-64, 46, -25, 4, 18, -38, 57, -73, -64, 46, -25, 4, 18, -38, 57, -73, + 83, -90, 90, -85, 75, -61, 43, -22, 83, -90, 90, -85, 75, -61, 43, -22, + 64, -90, 87, -82, 75, -67, 57, -46, 64, -90, 87, -82, 75, -67, 57, -46, // 120 + 36, -22, 9, 4, -18, 31, -43, 54, 36, -22, 9, 4, -18, 31, -43, 54, +-64, 73, -80, 85, -89, 90, -90, 88, -64, 73, -80, 85, -89, 90, -90, 88, +-83, 78, -70, 61, -50, 38, -25, 13, -83, 78, -70, 61, -50, 38, -25, 13, + 64, -90, 90, -90, 89, -88, 87, -85, 64, -90, 90, -90, 89, -88, 87, -85, + 83, -82, 80, -78, 75, -73, 70, -67, 83, -82, 80, -78, 75, -73, 70, -67, + 64, -61, 57, -54, 50, -46, 43, -38, 64, -61, 57, -54, 50, -46, 43, -38, + 36, -31, 25, -22, 18, -13, 9, -4, 36, -31, 25, -22, 18, -13, 9, -4, +}; + + +// 4xN +ALIGNED(32) static const int16_t ff_dct2_4x8_coeff_ver[256] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, +-18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, + 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, +-83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, + 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, + 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, // 8 + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, +-75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, + 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, +-36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, + 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, + 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_4x8_coeff_ver[256] = { + 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, // 0 + 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, + 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, + 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, + 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, 
-46, 71, 85, 32, -46, +-86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, + 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, + 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, + 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, // 8 + 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, + 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, +-60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, + 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, +-46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, + 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, + 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_4x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4xN_coeff_hor[64] = { + 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, + 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, + 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, + 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4xN_coeff_hor[64] = { + 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, + 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, + 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, + 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, + 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, + 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, + 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_hor[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_hor[128] = { + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, + 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, + 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, +-74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, +-74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, 
-17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 
-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_ver[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_ver[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 
25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_ver[512] = { + 88, 88, 87, 85, 81, 77, 73, 68, 88, 88, 87, 85, 81, 77, 73, 68, // 0 + 62, 55, 48, 40, 33, 25, 17, 8, 62, 55, 48, 40, 33, 25, 17, 8, + 88, 81, 68, 48, 25, 0, -25, -48, 88, 81, 68, 48, 25, 0, -25, -48, +-68, -81, -88, -88, -81, -68, -48, -25, -68, -81, -88, -88, -81, -68, -48, -25, + 87, 68, 33, -8, -48, -77, -88, -81, 87, 68, 33, -8, -48, -77, -88, -81, +-55, -17, 25, 62, 85, 88, 73, 40, -55, -17, 25, 62, 85, 88, 73, 40, + 85, 48, -8, -62, -88, -77, -33, 25, 85, 48, -8, -62, -88, -77, -33, 25, + 73, 88, 68, 17, -40, -81, -87, -55, 73, 88, 68, 17, -40, -81, -87, -55, + 81, 25, -48, -88, -68, 0, 68, 88, 81, 25, -48, -88, -68, 0, 68, 88, // 8 + 48, -25, -81, -81, -25, 48, 88, 68, 48, -25, -81, -81, -25, 48, 88, 68, + 77, 0, -77, -77, 0, 77, 77, 0, 77, 0, -77, -77, 0, 77, 77, 0, +-77, -77, 0, 77, 77, 0, -77, -77, -77, -77, 0, 77, 77, 0, -77, -77, + 73, -25, -88, -33, 68, 77, -17, -88, 73, -25, -88, -33, 68, 77, -17, -88, +-40, 62, 81, -8, -87, -48, 55, 85, -40, 62, 81, -8, -87, -48, 55, 85, + 68, -48, -81, 25, 88, 0, -88, -25, 68, -48, -81, 25, 88, 0, -88, -25, + 81, 48, -68, -68, 48, 81, -25, -88, 81, 48, -68, -68, 48, 81, -25, -88, + 62, -68, -55, 73, 48, -77, -40, 81, 62, -68, -55, 73, 48, -77, -40, 81, // 16 + 33, -85, -25, 87, 17, -88, -8, 88, 33, -85, -25, 87, 17, -88, -8, 88, + 55, -81, -17, 88, -25, -77, 62, 48, 55, -81, -17, 88, -25, -77, 62, 48, +-85, -8, 88, -33, -73, 68, 40, -87, -85, -8, 88, -33, -73, 68, 40, -87, + 48, -88, 25, 68, -81, 0, 81, -68, 48, -88, 25, 68, -81, 0, 81, -68, +-25, 88, -48, -48, 88, -25, -68, 81, -25, 88, -48, -48, 88, -25, -68, 81, + 40, -88, 62, 17, -81, 77, -8, -68, 40, -88, 62, 17, -81, 77, -8, -68, + 87, -33, -48, 88, -55, -25, 85, -73, 87, -33, -48, 88, -55, -25, 85, -73, + 33, -81, 85, -40, -25, 77, -87, 48, 33, -81, 85, -40, -25, 77, -87, 48, // 24 + 17, -73, 88, -55, -8, 68, -88, 62, 17, -73, 88, -55, -8, 
68, -88, 62, + 25, -68, 88, -81, 48, 0, -48, 81, 25, -68, 88, -81, 48, 0, -48, 81, +-88, 68, -25, -25, 68, -88, 81, -48, -88, 68, -25, -25, 68, -88, 81, -48, + 17, -48, 73, -87, 88, -77, 55, -25, 17, -48, 73, -87, 88, -77, 55, -25, + -8, 40, -68, 85, -88, 81, -62, 33, -8, 40, -68, 85, -88, 81, -62, 33, + 8, -25, 40, -55, 68, -77, 85, -88, 8, -25, 40, -55, 68, -77, 85, -88, + 88, -87, 81, -73, 62, -48, 33, -17, 88, -87, 81, -73, 62, -48, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x32_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x32_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x32_coeff_hor[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +// 8xN +ALIGNED(32) static const int16_t ff_dct2_8xN_coeff_hor[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) static const int16_t ff_dst7_8xN_coeff_hor[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 
85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71,
+-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8xN_coeff_hor[128] = {
+ 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32,
+ 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17,
+ 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60,
+ 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85,
+ 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46,
+-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85,
+ 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78,
+-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32,
+};
+
+
+ static const int16_t* ff_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x2_coeff_hor[128] = {
+ 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18,
+ 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50,
+ 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75,
+ 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89,
+ 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75,
+ 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50,
+ 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x2_coeff_hor[128] = {
+ 17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32,
+ 32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60,
+ 46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78,
+ 60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86,
+ 71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85,
+ 78, -17, -60, 86, -46, -32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71,
+ 85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46,
+ 86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17,
+};
+
+ static const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x4_coeff_ver[128] = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36,
+-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83,
+ 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64,
+-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64,
+ 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83,
+ 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36,
+};
+
+ALIGNED(32) static const int16_t ff_dst7_8x4_coeff_ver[128] = {
+ 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55,
+ 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84,
+ 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74,
+ 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29,
+-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55,
+ 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84,
+ 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8x4_coeff_ver[128] = {
+ 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74,
+ 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29,
+ 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0,
+-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74,
+ 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74,
+-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84,
+ 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74,
+ 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_hor[256] = {
+ 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0
+ 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18,
+ 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18,
+-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50,
+ 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89,
+-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75,
+ 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50,
+ 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8
+ 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89,
+ 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89,
+-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75,
+ 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18,
+-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50,
+ 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75,
+ 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_hor[256] = {
+ 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0
+ 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32,
+ 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46,
+-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60,
+ 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60,
+-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78,
+ 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78,
+ 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86,
+ 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8
+ 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85,
+ 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86,
+-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71,
+ 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32,
+-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46,
+ 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71,
+ 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17,
+};
+
+ALIGNED(32) static const int16_t fi_dct8_8x4_coeff_hor[256] = {
+ 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0
+ 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17,
+ 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32,
+-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46,
+ 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86,
+-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71,
+ 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17,
+ 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85,
+ 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8
+ 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86,
+ 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60,
+-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78,
+ 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46,
+-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60,
+ 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85,
+ 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_ver[128] = {
+ 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83,
+ 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36,
+ 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36,
+-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83,
+ 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36,
+-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83,
+ 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83,
+ 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_ver[128] = {
+ 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74,
+ 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55,
+ 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74,
+-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84,
+ 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0,
+-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74,
+ 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74,
+ 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29,
+};
+
+ static const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x8_coeff_ver[64] = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 89, 50, 75, 18, -18, -75, -50, -89,
+ 83, -36, 36, -83, -83, 36, -36, 83, 75, -89, -18, -50, 50, 18, 89, -75,
+ 64, -64, -64, 64, 64, -64, -64, 64, 50, 18, -89, 75, -75, 89, -18, -50,
+ 36, 83, -83, -36, -36, -83, 83, 36, 18, 75, -50, -89, 89, 50, -75, -18,
+};
+
+ALIGNED(32) static const int16_t ff_dst7_8x8_coeff_ver[64] = {
+ 17, 46, 32, 60, 71, 85, 78, 86, 46, 86, 78, 71, 32, -60, -17, -85,
+ 71, 32, 85, -46, -86, 17, -60, 78, 85, -60, 46, -78, 17, 32, 86, -71,
+ 86, -85, -17, 32, 78, -71, -46, 60, 78, -17, -71, 85, -60, 86, -32, -46,
+ 60, 71, -86, -17, -46, -78, 85, 32, 32, 78, -60, -86, 85, 46, -71, -17,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8x8_coeff_ver[64] = {
+ 86, 78, 85, 71, 60, 32, 46, 17, 85, 17, 60, -32, -71, -78, -86, -46,
+ 78, -60, 17, -86, -46, 85, 32, 71, 71, -86, -32, -17, 78, -46, 60, -85,
+ 60, -46, -71, 78, 32, -17, -85, 86, 46, 32, -86, 60, -85, 71, 17, -78,
+ 32, 85, -78, -46, -17, -86, 71, 60, 17, 71, -46, -85, 86, 60, -78, -32,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x8_coeff_hor[512] = {
+ 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, // 0
+ 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75,
+ 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50,
+ 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18,
+ 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75,
+ 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18,
+-64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89,
+-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50,
+ 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, // 8
+-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89,
+-64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18,
+ 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75,
+ 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18,
+-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50,
+ 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75,
+-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89,
+ 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, // 16
+-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50,
+ 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75,
+-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89,
+ 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50,
+-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89,
+-64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18,
+ 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75,
+ 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, // 24
+ 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18,
+-64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89,
+-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50,
+ 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89,
+ 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75,
+ 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50,
+ 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x8_coeff_hor[512] = {
+ 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, // 0
+ 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85,
+ 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78,
+ 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32,
+ 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78,
+ 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46,
+-17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71,
+-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60,
+ 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, // 8
+ 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60,
+-85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17,
+ 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78,
+ 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71,
+-46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78,
+ 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85,
+-17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86,
+ 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, // 16
+-86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17,
+ 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60,
+-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85,
+ 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17,
+-60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86,
+-46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32,
+ 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71,
+ 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, // 24
+ 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32,
+-71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86,
+-78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46,
+ 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85,
+ 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71,
+ 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46,
+ 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17,
+};
+
+ALIGNED(32) static const int16_t fi_dct8_8x8_coeff_hor[512] = {
+ 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0
+ 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71,
+ 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46,
+ 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17,
+ 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60,
+ 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32,
+-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86,
+-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46,
+ 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, // 8
+-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86,
+-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32,
+ 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71,
+ 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32,
+-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17,
+ 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60,
+-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85,
+ 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, // 16
+-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78,
+ 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85,
+-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86,
+ 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86,
+ 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60,
+-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17,
+ 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78,
+ 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, // 24
+ 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46,
+-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71,
+-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60,
+ 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46,
+ 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85,
+ 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78,
+ 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32,
+};
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x16_coeff_ver[256] = {
+ 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9,
75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_8x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_8x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, 
-48, -62, 33, 33, -17, +}; + +ALIGNED(32) static const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + static const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; + + static const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; + + static const int16_t* fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; + + +ALIGNED(32) static const int16_t fi_dct2_8x16_coeff_ver[2048] = { + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, // 0 + 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, + 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, + 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, + 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, + 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, + 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, + 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, // 8 + 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, + 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, +-18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, +-64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, +-89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, +-83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, +-50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, + 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, // 16 + 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, +-36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, +-89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, +-64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, + 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, + 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, + 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, // 24 + 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, +-83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, +-50, 9, -50, 9, -50, 9, -50, 9, 
-50, 9, -50, 9, -50, 9, -50, 9, + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, + 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, +-36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, +-89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, // 32 +-18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, +-83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, + 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, +-75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, +-36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, + 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, + 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, // 40 +-50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, +-36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, + 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, +-64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, +-18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, + 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, +-75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, + 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, // 48 +-75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, + 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, + 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, +-64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, + 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, +-83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, + 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, // 56 +-89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, + 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, +-75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, +-50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, + 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, +-18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, // 64 +-89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, + 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, +-75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, +-50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, + 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, +-18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, + 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, // 72 +-75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, + 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, + 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, +-64, -43, -64, -43, 
-64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, + 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, +-83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, + 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, + 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, // 80 +-50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, +-36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, + 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, +-64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, +-18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, + 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, +-75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, // 88 +-18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, +-83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, + 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, +-75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, +-36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, + 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, // 96 + 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, +-83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, +-50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, + 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, +-36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, +-89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, + 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, // 104 + 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, +-36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, +-89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, +-64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, + 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, + 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, + 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, + 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, // 112 + 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, + 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, +-18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, +-64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, +-89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, +-83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, +-50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, // 120 + 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, + 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, + 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, + 64, -57, 64, -57, 64, -57, 
64, -57, 64, -57, 64, -57, 64, -57, 64, -57, + 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, + 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, + 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_8x16_coeff_ver[2048] = { + 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, // 0 + 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, + 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, + 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, + 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, // 8 + 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, + 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, + 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, + -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, +-68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, // 16 + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, +-88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, + 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, // 24 + 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, + 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, +-88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, + 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, // 32 + 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, +-81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, + 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, + 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, + 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, // 40 + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, +-48, 48, -48, 48, -48, 
48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, + 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, + 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, // 48 +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, +-85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, + 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, +-73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, + 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, + 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, // 56 +-55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, + 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, + 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, +-25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, + 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, + -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, // 64 +-81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, +-68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, + 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, + 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, // 72 +-88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, + 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 80 +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, // 88 +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, +-81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, 
-48, + 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, + 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, // 96 + -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, +-88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, +-33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, + 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, + 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, +-87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, + 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, // 104 + 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, +-48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, + 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, // 112 + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, +-48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 120 + 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, + 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, + 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, + 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, + 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, + 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, + 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_8x16_coeff_ver[2048] = { + 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, // 0 + 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, + 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, + 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, + 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, + 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, + 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, + 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, // 8 + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, +-88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, +-81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, +-48, -25, -48, 
-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, + 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, // 16 + 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, +-48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, + 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, + 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, // 24 + -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, +-88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, +-33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, + 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, + 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, +-87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, + 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, // 32 +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, +-81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, + 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 40 +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, // 48 +-88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, + 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, // 56 +-81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, +-68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, + 62, -68, 62, -68, 
62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, // 64 +-55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, + 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, + 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, +-25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, + 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, + -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, + 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, // 72 +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, +-85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, + 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, +-73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, + 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, + 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, // 80 + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, +-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, + 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, + 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, // 88 + 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, +-81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, + 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, + 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, + 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, // 96 + 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, + 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, +-88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, // 104 + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, +-88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, +-25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, + 17, -48, 17, -48, 17, -48, 17, -48, 
17, -48, 17, -48, 17, -48, 17, -48, // 112 + 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, + 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, + 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, + -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, +-68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, + 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, // 120 + 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, + 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, + 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, +}; + + +ALIGNED(32) static const int16_t ff_dct2_8x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 
64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_8x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, 
-63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t 
ff_dct8_8x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, 
-63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + static const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; + + static const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; + + static const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; + + +// 16xN +ALIGNED(32) static const int16_t ff_dct2_16xN_coeff_hor[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_16xN_coeff_hor[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, 
-77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_16xN_coeff_hor[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, -55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +static const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) static const int16_t fi_dct2_16x2_coeff_hor[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 
57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + + static const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + + +ALIGNED(32) static const int16_t fi_dst7_16x2_coeff_hor[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 
25, -25, 25, -25, 25, -25, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, // 8 + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, // 16 + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, // 24 +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, // 32 +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, // 40 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, // 48 +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 
+ 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, // 56 +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_hor[1024] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, + 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, + 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, + 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, +-64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, +-83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, // 8 +-36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, +-64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, + 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, +-83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, -83, -87, 
-50, 9, + 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, +-36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, // 16 +-83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, + 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, +-36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, +-36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, +-64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, + 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, // 24 + 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, +-64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, +-83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, + 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, + 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, + 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, // 32 + 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, + 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, + 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, + 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, +-64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, +-83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, // 40 +-36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, +-64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, + 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, +-83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, + 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, +-36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, // 48 +-83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, + 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, +-36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, +-36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, +-64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, + 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, // 56 + 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, +-64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, +-83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, + 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, + 64, -57, 50, -43, 
64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, + 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_hor[1024] = { + 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, + 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, + 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, + 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, + -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, +-88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, // 8 + 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, +-88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, + 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, +-25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, + 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, + -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, // 16 +-81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, + 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, +-55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, +-81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, +-25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, + 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, // 24 +-25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, +-85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, +-73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, + 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, + 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, + 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, // 32 + 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, + 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, + 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, + 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, +-40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, +-87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, // 40 + 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, +-77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, +-68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, + 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 
+-25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, // 48 +-88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, + 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, +-40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, +-48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, +-55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, + 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, // 56 + 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, +-68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, +-81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, + 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, + 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, + 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_hor[1024] = { + 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, // 0 + 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, + 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, + 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, + 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, + 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, +-68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, +-81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, + 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, // 8 +-48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, +-55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, + 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, + 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, +-88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, + 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, +-40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, + 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, // 16 +-68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, + 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, +-25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, +-77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, // 24 + 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, +-40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, +-87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, + 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, + 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, + 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, + 48, 81, 
-25, -88, 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, + 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, // 32 + 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, + 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, + 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, + 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, +-25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, +-85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, +-73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, + 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, // 40 +-81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, +-25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, + 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, + 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, +-81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, + 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, +-55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, + 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, // 48 +-25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, + 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, + -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, + 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, + 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, +-88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, + 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, + 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, // 56 + 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, + -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, +-88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, + 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, + 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, + 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, + 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 
84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x8_coeff_ver[64] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, + 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, + 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, + 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x8_coeff_ver[64] = { + 17, 32, 46, 78, 71, 85, 85, 46, 86, -17, 78, -71, 60, -86, 32, -60, + 46, 60, 86, 71, 32, -46, -60, -78, -85, 32, -17, 85, 71, -17, 78, -86, + 71, 78, 32, -17, -86, -60, 17, 86, 78, -46, -60, -32, -46, 85, 85, -71, + 85, 86, -60, -85, 17, 78, 32, -71, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x8_coeff_ver[64] = { + 86, 85, 85, 60, 78, 17, 71, -32, 60, -71, 46, -86, 32, -78, 17, -46, + 78, 71, 17, -32, -60, -86, -86, -17, -46, 78, 32, 60, 85, -46, 71, -85, + 60, 46, -71, -86, -46, 32, 78, 60, 32, -85, -85, 17, -17, 71, 86, -78, + 32, 17, -78, -46, 85, 71, -46, -85, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, + -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, + -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, 
-18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + static const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. + + static const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. + + static const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. + + + static const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table + + static const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table + + static const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table + + +ALIGNED(32) static const int16_t ff_dct2_16x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, 
-88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x16_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x16_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct2_16x1_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 
57, 64, 43, 64, 25, 64, 9, // 0
+ 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25,
+ 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43,
+ 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57,
+ 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8
+ 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80,
+ 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87,
+ 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90,
+ 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90,
+-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87,
+ 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80,
+-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70,
+ 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57,
+-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43,
+ 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25,
+-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_16x1_coeff_hor[256] = {
+ 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0
+ 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73,
+ 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77,
+ 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81,
+ 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8
+ 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87,
+ 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88,
+ 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88,
+ 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88,
+-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85,
+ 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77,
+-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68,
+ 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55,
+-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40,
+ 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25,
+-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
+};
+
+static const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver;
+
+
+ static const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor;
+
+ static const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor;
+
+ static const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver;
+
+
+ALIGNED(32) static const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to a 32-bit combined coeff table at some point; these huge tables are getting out of hand
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90,
+ 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88,
+ 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85,
+ 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82,
+ 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78,
+ 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73,
+ 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67,
+ 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 8
+ 54, 
-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 16 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, // 24 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 32 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, // 40 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 48 + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 56 + 88, -88, 88, -88, 88, -88, 88, -88, 88, 
-88, 88, -88, 88, -88, 88, -88, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 64 + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 72 +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, // 80 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, // 88 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, // 96 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, // 104 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 61, 
-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 112 +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 120 + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 128 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, // 136 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 144 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, // 152 + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, 
-67, 67, -67, 67, -67, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, // 160 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 168 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, // 176 +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 184 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 192 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, // 200 +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-90, 
90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 208 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, // 216 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, // 224 +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 232 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, // 240 +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 248 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, +-82, 82, -82, 82, -82, 82, -82, 82, 
-82, 82, -82, 82, -82, 82, -82, 82, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + +ALIGNED(32) static const int16_t ff_dct2_16x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, 
+-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, 
+-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 
74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 
82, -68, -63, 50, 34, -26,
+ 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60,
+ 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90,
+ -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68,
+-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9,
+};
+
+
+ static const int16_t* fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor;
+
+ static const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor;
+
+ static const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver;
+
+// 32xN
+ALIGNED(32) static const int16_t ff_dct2_32xN_coeff_hor[1024] = {
+ 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0
+ 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54,
+ 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88,
+ 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13,
+ 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67,
+-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38,
+-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4,
+ 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31,
+ 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8
+-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22,
+ 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90,
+-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46,
+ 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38,
+ 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4,
+-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31,
+-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61,
+ 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16
+ 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13,
+ 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78,
+ 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73,
+ 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4,
+-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31,
+-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61,
+ 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82,
+ 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24
+-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46,
+ 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54,
+-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88,
+ 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31,
+ 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61,
+-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82,
+-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90,
+ 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32
+ 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73,
+ 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22,
+ 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90,
+ 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61,
+-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82,
+-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90,
+ 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85,
+
64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_32xN_coeff_hor[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, 
+ 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_32xN_coeff_hor[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, 
-30, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, 
-90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +ALIGNED(32) static const int16_t fi_dct2_32xN_coeff_hor[1024] = { +64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, // 0 + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, // 2 +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, -89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, // 4 + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, -89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, // 6 +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, -25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, // 8 + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, -43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, // 10 +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, -75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, // 12 + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, -75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, // 14 +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, -57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, // 16 + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, -70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, // 18 +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 
54, -75, -90, -18, 31, 89, 61, -50, -88, -50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, // 20 + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, -50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, // 22 +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, -80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, // 24 + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, -87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, // 26 +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, -18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, // 28 + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, -18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, // 30 + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, -90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + + +ALIGNED(32) static const int16_t fi_dst7_32xN_coeff_hor[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, // 0 + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, // 2 +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, -21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, -63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, // 4 + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, -89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, -53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, // 6 +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, // 8 + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, // 10 +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, 
-21, 21, 74, 90, 74, 26, -21, -77, -89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, // 12 + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, -84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, // 14 +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, -42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, // 16 + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, -34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, // 18 +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, -77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, // 20 + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, -60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, // 22 +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, -74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, // 24 + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, -78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, // 26 +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, -30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, // 28 + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, -21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, // 30 +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, -89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + +ALIGNED(32) static const int16_t fi_dct8_32xN_coeff_hor[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 
42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, 
-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +static const int16_t ff_dct8_4x32_coeff_ver[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 
90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; +static const int16_t ff_dst7_4x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 
85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12
+ 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87,
+ 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14
+-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90,
+ 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16
+ 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88,
+ 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18
+-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84,
+ 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20
+ 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77,
+ 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22
+-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66,
+ 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24
+ 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53,
+ 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26
+-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38,
+ 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28
+ 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21,
+ 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30
+-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4,
+};
+
+ static const int16_t* ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // Identical to the existing 2xN table
+
+
+ static const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor;
+
+
+ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = {
+ 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, // 0
+ 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70,
+ 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43,
+ 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9,
+ 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57,
+ 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43,
+-80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90,
+-70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25,
+ 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, // 8
+-70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87,
+-25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57,
+ 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43,
+ 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43,
+-87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9,
+ 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25,
+-80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57,
+ 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, // 16
+-25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90,
+ -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87,
+ 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70,
+ 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90,
+ 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25,
+-87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70,
+ 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80,
+ 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, // 24
+ 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80,
+ 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9,
+-57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87,
+ 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25,
+ 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57,
+ 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80,
+ 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90,
+};
+
+ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to a 32-bit combined coeff table at some point; these huge tables are getting out of hand
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
+ 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85,
+ 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78,
+ 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67,
+ 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54,
+ 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38,
+ 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22,
+ 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4,
+ 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, // 8
+ 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46,
+ 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4,
+-31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54,
+-73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85,
+-90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88,
+-78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61,
+-38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13,
+ 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, // 16
+ 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13,
+-54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82,
+-90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78,
+-46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4,
+
38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, + 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, + 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, + 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, // 24 +-13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, +-90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, +-22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, + 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, + 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, +-61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, +-78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, + 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, // 32 +-54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, +-61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, + 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, + 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, +-90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, + 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, + 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, + 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, // 40 +-82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, + 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, + 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, +-88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, + 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, + 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, +-90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, + 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, // 48 +-90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, + 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, +-38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, +-13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, + 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, +-88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, + 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, + 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, // 56 +-78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, + 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, +-90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, + 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, +-88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, + 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, +-73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, + 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, // 64 +-46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, + 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, +-13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, + -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, + 22, -22, 85, -85, 22, -22, 85, -85, 
22, -22, 85, -85, 22, -22, 85, -85, +-38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, + 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, + 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, // 72 + -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, +-46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, + 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, +-90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, + 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, +-22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, +-31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, + 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, // 80 + 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, +-90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, + 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, + 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, +-85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, + 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, + 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, + 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, // 88 + 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, +-67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, +-46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, + 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, + 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, +-90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, + 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, + 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, // 96 + 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, + 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, +-88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, +-38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, + 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, + 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, +-46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, + 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, // 104 + 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, + 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, + -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, +-78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, +-82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, +-13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, + 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, + 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, // 112 + 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, + 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, + 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, + 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, + 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 
22, -22, +-46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, +-82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, + 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, // 120 + 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, + 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, + 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, + 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, + 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, + 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, + 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 
74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x8_coeff_ver[512] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 8 + 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, + 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 16 +-18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, + 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, + 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 24 +-75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, + 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, + 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, +-83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, + 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x8_coeff_ver[512] = { + 17, 32, 17, 32, 17, 32, 17, 32, 
17, 32, 17, 32, 17, 32, 17, 32, // 0 + 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, + 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, + 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, // 8 + 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, + 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, +-60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, +-85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, +-17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, + 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, + 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, // 16 + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, + 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, +-60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, +-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, // 24 +-60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, + 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, + 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, +-71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, + 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, +-78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, + 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x8_coeff_ver[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, // 8 + 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, + 60, 46, 60, 
46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, // 16 +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, // 24 +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, 
-71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, 
-88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x16_coeff_ver[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x16_coeff_ver[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 
88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 
4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 
8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 
17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, 
-4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + {a, a}, \ + {a, -a} \ +} + +#define DEFINE_DCT2_P4_MATRIX(a,b,c) \ +{ \ + { a, a, a, a}, \ + { b, c, -c, -b}, \ + { a, -a, -a, a}, \ + { c, -b, b, -c} \ +} + +#define DEFINE_DCT2_P8_MATRIX(a,b,c,d,e,f,g) \ +{ \ + { a, a, a, a, a, a, a, a}, \ + { d, e, f, g, -g, -f, -e, -d}, \ + { b, c, -c, -b, -b, -c, c, b}, \ + { e, -g, -d, -f, f, d, g, -e}, \ + { a, -a, -a, a, a, -a, -a, a}, \ + { f, -d, g, e, -e, -g, d, -f}, \ + { c, -b, b, -c, -c, b, -b, c}, \ + { g, -f, e, -d, d, -e, f, -g} \ +} + +#define DEFINE_DCT2_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o} \ +} + +#define DEFINE_DCT2_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, -E, -D, -C, -B, -A, -z, -y, -x, -w, -v, 
-u, -t, -s, -r, -q, -p}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h, -h, -i, -j, -k, -l, -m, -n, -o, o, n, m, l, k, j, i, h}, \ + { q, t, w, z, C, -E, -B, -y, -v, -s, -p, -r, -u, -x, -A, -D, D, A, x, u, r, p, s, v, y, B, E, -C, -z, -w, -t, -q}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d, d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { r, w, B, -D, -y, -t, -p, -u, -z, -E, A, v, q, s, x, C, -C, -x, -s, -q, -v, -A, E, z, u, p, t, y, D, -B, -w, -r}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i, -i, -l, -o, m, j, h, k, n, -n, -k, -h, -j, -m, o, l, i}, \ + { s, z, -D, -w, -p, -v, -C, A, t, r, y, -E, -x, -q, -u, -B, B, u, q, x, E, -y, -r, -t, -A, C, v, p, w, D, -z, -s}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { t, C, -y, -p, -x, D, u, s, B, -z, -q, -w, E, v, r, A, -A, -r, -v, -E, w, q, z, -B, -s, -u, -D, x, p, y, -C, -t}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j, -j, -o, k, i, n, -l, -h, -m, m, h, l, -n, -i, -k, o, j}, \ + { u, -E, -t, -v, D, s, w, -C, -r, -x, B, q, y, -A, -p, -z, z, p, A, -y, -q, -B, x, r, C, -w, -s, -D, v, t, E, -u}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e, e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { v, -B, -p, -C, u, w, -A, -q, -D, t, x, -z, -r, -E, s, y, -y, -s, E, r, z, -x, -t, D, q, A, -w, -u, C, p, B, -v}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k, -k, m, i, -o, -h, -n, j, l, -l, -j, n, h, o, -i, -m, k}, \ + { w, -y, -u, A, s, -C, -q, E, p, D, -r, -B, t, z, -v, -x, x, v, -z, -t, B, r, -D, -p, -E, q, C, -s, -A, u, y, -w}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { x, -v, -z, t, B, -r, -D, p, -E, -q, C, s, -A, -u, y, w, -w, -y, u, A, -s, -C, q, E, -p, D, r, -B, -t, z, v, -x}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l, -l, j, n, -h, o, i, -m, -k, k, m, -i, -o, h, -n, -j, l}, \ + { y, -s, -E, r, -z, -x, t, D, -q, A, w, -u, -C, p, -B, -v, v, B, -p, C, u, -w, -A, q, -D, -t, x, z, -r, E, s, -y}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f, f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { z, -p, A, y, -q, B, x, -r, C, w, -s, D, v, -t, E, u, -u, -E, t, -v, -D, s, -w, -C, r, -x, -B, q, -y, -A, p, -z}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m, -m, h, -l, -n, i, -k, -o, j, -j, o, k, -i, n, l, -h, m}, \ + { A, -r, v, -E, -w, q, -z, -B, s, -u, D, x, -p, y, C, -t, t, -C, -y, p, -x, -D, u, -s, B, z, -q, w, E, -v, r, -A}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { B, -u, q, -x, E, y, -r, t, -A, -C, v, -p, w, -D, -z, s, -s, z, D, -w, p, -v, C, A, -t, r, -y, -E, x, -q, u, -B}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n, -n, k, -h, j, -m, -o, l, -i, i, -l, o, m, -j, h, -k, n}, \ + { C, -x, s, -q, v, -A, -E, z, -u, p, -t, y, -D, -B, w, -r, r, -w, B, D, -y, t, -p, u, -z, E, A, -v, q, -s, x, -C}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g, g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { D, -A, x, -u, r, -p, s, -v, y, -B, E, C, -z, w, -t, q, -q, t, -w, z, -C, -E, B, -y, v, -s, p, -r, u, -x, A, -D}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o, -o, n, -m, l, -k, j, -i, h, -h, i, -j, k, -l, m, -n, o}, \ + { E, -D, C, -B, A, -z, y, -x, w, -v, u, -t, s, -r, q, -p, p, -q, r, -s, t, 
-u, v, -w, x, -y, z, -A, B, -C, D, -E} \ +} + + +#define DEFINE_DCT2_P64_MATRIX(aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck) \ +{ \ + { aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa }, \ + { bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck, -ck, -cj, -ci, -ch, -cg, -cf, -ce, -cd, -cc, -cb, -ca, -bz, -by, -bx, -bw, -bv, -bu, -bt, -bs, -br, -bq, -bp, -bo, -bn, -bm, -bl, -bk, -bj, -bi, -bh, -bg, -bf }, \ + { ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, -be, -bd, -bc, -bb, -ba, -az, -ay, -ax, -aw, -av, -au, -at, -as, -ar, -aq, -ap, -ap, -aq, -ar, -as, -at, -au, -av, -aw, -ax, -ay, -az, -ba, -bb, -bc, -bd, -be, be, bd, bc, bb, ba, az, ay, ax, aw, av, au, at, as, ar, aq, ap }, \ + { bg, bj, bm, bp, bs, bv, by, cb, ce, ch, ck, -ci, -cf, -cc, -bz, -bw, -bt, -bq, -bn, -bk, -bh, -bf, -bi, -bl, -bo, -br, -bu, -bx, -ca, -cd, -cg, -cj, cj, cg, cd, ca, bx, bu, br, bo, bl, bi, bf, bh, bk, bn, bq, bt, bw, bz, cc, cf, ci, -ck, -ch, -ce, -cb, -by, -bv, -bs, -bp, -bm, -bj, -bg }, \ + { ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah, ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah }, \ + { bh, bm, br, bw, cb, cg, -ck, -cf, -ca, -bv, -bq, -bl, -bg, -bi, -bn, -bs, -bx, -cc, -ch, cj, ce, bz, bu, bp, bk, bf, bj, bo, bt, by, cd, ci, -ci, -cd, -by, -bt, -bo, -bj, -bf, -bk, -bp, -bu, -bz, -ce, -cj, ch, cc, bx, bs, bn, bi, bg, bl, bq, bv, ca, cf, ck, -cg, -cb, -bw, -br, -bm, -bh }, \ + { aq, at, aw, az, bc, -be, -bb, -ay, -av, -as, -ap, -ar, -au, -ax, -ba, -bd, bd, ba, ax, au, ar, ap, as, av, ay, bb, be, -bc, -az, -aw, -at, -aq, -aq, -at, -aw, -az, -bc, be, bb, ay, av, as, ap, ar, au, ax, ba, bd, -bd, -ba, -ax, -au, -ar, -ap, -as, -av, -ay, -bb, -be, bc, az, aw, at, aq }, \ + { bi, bp, bw, cd, ck, -ce, -bx, -bq, -bj, -bh, -bo, -bv, -cc, -cj, cf, by, br, bk, bg, bn, bu, cb, ci, -cg, -bz, -bs, -bl, -bf, -bm, -bt, -ca, -ch, ch, ca, bt, bm, bf, bl, bs, bz, cg, -ci, -cb, -bu, -bn, -bg, -bk, -br, -by, -cf, cj, cc, bv, bo, bh, bj, bq, bx, ce, -ck, -cd, -bw, -bp, -bi }, \ + { ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad }, \ + { bj, bs, cb, ck, -cc, -bt, -bk, -bi, -br, -ca, -cj, cd, bu, bl, bh, bq, bz, ci, -ce, -bv, -bm, -bg, -bp, -by, -ch, cf, bw, bn, bf, bo, bx, cg, -cg, -bx, -bo, -bf, -bn, -bw, -cf, ch, by, bp, bg, bm, bv, ce, -ci, -bz, -bq, -bh, -bl, -bu, -cd, cj, ca, br, bi, bk, bt, cc, -ck, -cb, -bs, -bj }, \ + { ar, aw, bb, -bd, -ay, -at, -ap, -au, -az, -be, ba, av, aq, as, ax, bc, -bc, -ax, -as, -aq, -av, -ba, be, az, au, ap, at, ay, bd, -bb, -aw, -ar, -ar, -aw, -bb, bd, ay, at, ap, au, az, be, -ba, -av, -aq, -as, -ax, -bc, bc, ax, as, aq, av, ba, -be, -az, -au, -ap, -at, -ay, -bd, bb, aw, 
ar }, \ + { bk, bv, cg, -ce, -bt, -bi, -bm, -bx, -ci, cc, br, bg, bo, bz, ck, -ca, -bp, -bf, -bq, -cb, cj, by, bn, bh, bs, cd, -ch, -bw, -bl, -bj, -bu, -cf, cf, bu, bj, bl, bw, ch, -cd, -bs, -bh, -bn, -by, -cj, cb, bq, bf, bp, ca, -ck, -bz, -bo, -bg, -br, -cc, ci, bx, bm, bi, bt, ce, -cg, -bv, -bk }, \ + { ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai, ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai }, \ + { bl, by, -ck, -bx, -bk, -bm, -bz, cj, bw, bj, bn, ca, -ci, -bv, -bi, -bo, -cb, ch, bu, bh, bp, cc, -cg, -bt, -bg, -bq, -cd, cf, bs, bf, br, ce, -ce, -br, -bf, -bs, -cf, cd, bq, bg, bt, cg, -cc, -bp, -bh, -bu, -ch, cb, bo, bi, bv, ci, -ca, -bn, -bj, -bw, -cj, bz, bm, bk, bx, ck, -by, -bl }, \ + { as, az, -bd, -aw, -ap, -av, -bc, ba, at, ar, ay, -be, -ax, -aq, -au, -bb, bb, au, aq, ax, be, -ay, -ar, -at, -ba, bc, av, ap, aw, bd, -az, -as, -as, -az, bd, aw, ap, av, bc, -ba, -at, -ar, -ay, be, ax, aq, au, bb, -bb, -au, -aq, -ax, -be, ay, ar, at, ba, -bc, -av, -ap, -aw, -bd, az, as }, \ + { bm, cb, -cf, -bq, -bi, -bx, cj, bu, bf, bt, ci, -by, -bj, -bp, -ce, cc, bn, bl, ca, -cg, -br, -bh, -bw, ck, bv, bg, bs, ch, -bz, -bk, -bo, -cd, cd, bo, bk, bz, -ch, -bs, -bg, -bv, -ck, bw, bh, br, cg, -ca, -bl, -bn, -cc, ce, bp, bj, by, -ci, -bt, -bf, -bu, -cj, bx, bi, bq, cf, -cb, -bm }, \ + { ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab }, \ + { bn, ce, -ca, -bj, -br, -ci, bw, bf, bv, -cj, -bs, -bi, -bz, cf, bo, bm, cd, -cb, -bk, -bq, -ch, bx, bg, bu, -ck, -bt, -bh, -by, cg, bp, bl, cc, -cc, -bl, -bp, -cg, by, bh, bt, ck, -bu, -bg, -bx, ch, bq, bk, cb, -cd, -bm, -bo, -cf, bz, bi, bs, cj, -bv, -bf, -bw, ci, br, bj, ca, -ce, -bn }, \ + { at, bc, -ay, -ap, -ax, bd, au, as, bb, -az, -aq, -aw, be, av, ar, ba, -ba, -ar, -av, -be, aw, aq, az, -bb, -as, -au, -bd, ax, ap, ay, -bc, -at, -at, -bc, ay, ap, ax, -bd, -au, -as, -bb, az, aq, aw, -be, -av, -ar, -ba, ba, ar, av, be, -aw, -aq, -az, bb, as, au, bd, -ax, -ap, -ay, bc, at }, \ + { bo, ch, -bv, -bh, -ca, cc, bj, bt, -cj, -bq, -bm, -cf, bx, bf, by, -ce, -bl, -br, -ck, bs, bk, cd, -bz, -bg, -bw, cg, bn, bp, ci, -bu, -bi, -cb, cb, bi, bu, -ci, -bp, -bn, -cg, bw, bg, bz, -cd, -bk, -bs, ck, br, bl, ce, -by, -bf, -bx, cf, bm, bq, cj, -bt, -bj, -cc, ca, bh, bv, -ch, -bo }, \ + { aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj, aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj }, \ + { bp, ck, -bq, -bo, -cj, br, bn, ci, -bs, -bm, -ch, bt, bl, cg, -bu, -bk, -cf, bv, bj, ce, -bw, -bi, -cd, bx, bh, cc, -by, -bg, -cb, bz, bf, ca, -ca, -bf, -bz, cb, bg, by, -cc, -bh, -bx, cd, bi, bw, -ce, -bj, -bv, cf, bk, bu, -cg, -bl, -bt, ch, bm, bs, -ci, -bn, -br, cj, bo, bq, -ck, -bp }, \ + { au, -be, -at, -av, bd, as, aw, -bc, -ar, -ax, bb, aq, ay, -ba, -ap, -az, az, ap, ba, -ay, -aq, -bb, ax, ar, bc, -aw, -as, -bd, av, at, be, -au, -au, be, at, av, -bd, -as, -aw, bc, ar, ax, -bb, -aq, -ay, ba, ap, az, -az, -ap, -ba, ay, aq, bb, -ax, -ar, -bc, aw, as, bd, -av, -at, -be, au 
}, \ + { bq, -ci, -bl, -bv, cd, bg, ca, -by, -bi, -cf, bt, bn, ck, -bo, -bs, cg, bj, bx, -cb, -bf, -cc, bw, bk, ch, -br, -bp, cj, bm, bu, -ce, -bh, -bz, bz, bh, ce, -bu, -bm, -cj, bp, br, -ch, -bk, -bw, cc, bf, cb, -bx, -bj, -cg, bs, bo, -ck, -bn, -bt, cf, bi, by, -ca, -bg, -cd, bv, bl, ci, -bq }, \ + { ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae }, \ + { br, -cf, -bg, -cc, bu, bo, -ci, -bj, -bz, bx, bl, ck, -bm, -bw, ca, bi, ch, -bp, -bt, cd, bf, ce, -bs, -bq, cg, bh, cb, -bv, -bn, cj, bk, by, -by, -bk, -cj, bn, bv, -cb, -bh, -cg, bq, bs, -ce, -bf, -cd, bt, bp, -ch, -bi, -ca, bw, bm, -ck, -bl, -bx, bz, bj, ci, -bo, -bu, cc, bg, cf, -br }, \ + { av, -bb, -ap, -bc, au, aw, -ba, -aq, -bd, at, ax, -az, -ar, -be, as, ay, -ay, -as, be, ar, az, -ax, -at, bd, aq, ba, -aw, -au, bc, ap, bb, -av, -av, bb, ap, bc, -au, -aw, ba, aq, bd, -at, -ax, az, ar, be, -as, -ay, ay, as, -be, -ar, -az, ax, at, -bd, -aq, -ba, aw, au, -bc, -ap, -bb, av }, \ + { bs, -cc, -bi, -cj, bl, bz, -bv, -bp, cf, bf, cg, -bo, -bw, by, bm, -ci, -bh, -cd, br, bt, -cb, -bj, -ck, bk, ca, -bu, -bq, ce, bg, ch, -bn, -bx, bx, bn, -ch, -bg, -ce, bq, bu, -ca, -bk, ck, bj, cb, -bt, -br, cd, bh, ci, -bm, -by, bw, bo, -cg, -bf, -cf, bp, bv, -bz, -bl, cj, bi, cc, -bs }, \ + { ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak, ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak }, \ + { bt, -bz, -bn, cf, bh, ck, -bi, -ce, bo, by, -bu, -bs, ca, bm, -cg, -bg, -cj, bj, cd, -bp, -bx, bv, br, -cb, -bl, ch, bf, ci, -bk, -cc, bq, bw, -bw, -bq, cc, bk, -ci, -bf, -ch, bl, cb, -br, -bv, bx, bp, -cd, -bj, cj, bg, cg, -bm, -ca, bs, bu, -by, -bo, ce, bi, -ck, -bh, -cf, bn, bz, -bt }, \ + { aw, -ay, -au, ba, as, -bc, -aq, be, ap, bd, -ar, -bb, at, az, -av, -ax, ax, av, -az, -at, bb, ar, -bd, -ap, -be, aq, bc, -as, -ba, au, ay, -aw, -aw, ay, au, -ba, -as, bc, aq, -be, -ap, -bd, ar, bb, -at, -az, av, ax, -ax, -av, az, at, -bb, -ar, bd, ap, be, -aq, -bc, as, ba, -au, -ay, aw }, \ + { bu, -bw, -bs, by, bq, -ca, -bo, cc, bm, -ce, -bk, cg, bi, -ci, -bg, ck, bf, cj, -bh, -ch, bj, cf, -bl, -cd, bn, cb, -bp, -bz, br, bx, -bt, -bv, bv, bt, -bx, -br, bz, bp, -cb, -bn, cd, bl, -cf, -bj, ch, bh, -cj, -bf, -ck, bg, ci, -bi, -cg, bk, ce, -bm, -cc, bo, ca, -bq, -by, bs, bw, -bu }, \ + { aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa }, \ + { bv, -bt, -bx, br, bz, -bp, -cb, bn, cd, -bl, -cf, bj, ch, -bh, -cj, bf, -ck, -bg, ci, bi, -cg, -bk, ce, bm, -cc, -bo, ca, bq, -by, -bs, bw, bu, -bu, -bw, bs, by, -bq, -ca, bo, cc, -bm, -ce, bk, cg, -bi, -ci, bg, ck, -bf, cj, bh, -ch, -bj, cf, bl, -cd, -bn, cb, bp, -bz, -br, bx, bt, -bv }, \ + { ax, -av, -az, at, bb, -ar, -bd, ap, -be, -aq, bc, as, -ba, -au, ay, aw, -aw, -ay, au, ba, -as, -bc, aq, be, -ap, bd, ar, -bb, -at, az, av, -ax, -ax, av, az, -at, -bb, ar, bd, -ap, be, aq, -bc, -as, ba, au, -ay, -aw, aw, ay, -au, -ba, as, bc, -aq, -be, ap, -bd, -ar, bb, at, -az, -av, ax }, 
\ + { bw, -bq, -cc, bk, ci, -bf, ch, bl, -cb, -br, bv, bx, -bp, -cd, bj, cj, -bg, cg, bm, -ca, -bs, bu, by, -bo, -ce, bi, ck, -bh, cf, bn, -bz, -bt, bt, bz, -bn, -cf, bh, -ck, -bi, ce, bo, -by, -bu, bs, ca, -bm, -cg, bg, -cj, -bj, cd, bp, -bx, -bv, br, cb, -bl, -ch, bf, -ci, -bk, cc, bq, -bw }, \ + { al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al, al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al }, \ + { bx, -bn, -ch, bg, -ce, -bq, bu, ca, -bk, -ck, bj, -cb, -bt, br, cd, -bh, ci, bm, -by, -bw, bo, cg, -bf, cf, bp, -bv, -bz, bl, cj, -bi, cc, bs, -bs, -cc, bi, -cj, -bl, bz, bv, -bp, -cf, bf, -cg, -bo, bw, by, -bm, -ci, bh, -cd, -br, bt, cb, -bj, ck, bk, -ca, -bu, bq, ce, -bg, ch, bn, -bx }, \ + { ay, -as, -be, ar, -az, -ax, at, bd, -aq, ba, aw, -au, -bc, ap, -bb, -av, av, bb, -ap, bc, au, -aw, -ba, aq, -bd, -at, ax, az, -ar, be, as, -ay, -ay, as, be, -ar, az, ax, -at, -bd, aq, -ba, -aw, au, bc, -ap, bb, av, -av, -bb, ap, -bc, -au, aw, ba, -aq, bd, at, -ax, -az, ar, -be, -as, ay }, \ + { by, -bk, cj, bn, -bv, -cb, bh, -cg, -bq, bs, ce, -bf, cd, bt, -bp, -ch, bi, -ca, -bw, bm, ck, -bl, bx, bz, -bj, ci, bo, -bu, -cc, bg, -cf, -br, br, cf, -bg, cc, bu, -bo, -ci, bj, -bz, -bx, bl, -ck, -bm, bw, ca, -bi, ch, bp, -bt, -cd, bf, -ce, -bs, bq, cg, -bh, cb, bv, -bn, -cj, bk, -by }, \ + { af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af }, \ + { bz, -bh, ce, bu, -bm, cj, bp, -br, -ch, bk, -bw, -cc, bf, -cb, -bx, bj, -cg, -bs, bo, ck, -bn, bt, cf, -bi, by, ca, -bg, cd, bv, -bl, ci, bq, -bq, -ci, bl, -bv, -cd, bg, -ca, -by, bi, -cf, -bt, bn, -ck, -bo, bs, cg, -bj, bx, cb, -bf, cc, bw, -bk, ch, br, -bp, -cj, bm, -bu, -ce, bh, -bz }, \ + { az, -ap, ba, ay, -aq, bb, ax, -ar, bc, aw, -as, bd, av, -at, be, au, -au, -be, at, -av, -bd, as, -aw, -bc, ar, -ax, -bb, aq, -ay, -ba, ap, -az, -az, ap, -ba, -ay, aq, -bb, -ax, ar, -bc, -aw, as, -bd, -av, at, -be, -au, au, be, -at, av, bd, -as, aw, bc, -ar, ax, bb, -aq, ay, ba, -ap, az }, \ + { ca, -bf, bz, cb, -bg, by, cc, -bh, bx, cd, -bi, bw, ce, -bj, bv, cf, -bk, bu, cg, -bl, bt, ch, -bm, bs, ci, -bn, br, cj, -bo, bq, ck, -bp, bp, -ck, -bq, bo, -cj, -br, bn, -ci, -bs, bm, -ch, -bt, bl, -cg, -bu, bk, -cf, -bv, bj, -ce, -bw, bi, -cd, -bx, bh, -cc, -by, bg, -cb, -bz, bf, -ca }, \ + { am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am, am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am }, \ + { cb, -bi, bu, ci, -bp, bn, -cg, -bw, bg, -bz, -cd, bk, -bs, -ck, br, -bl, ce, by, -bf, bx, cf, -bm, bq, -cj, -bt, bj, -cc, -ca, bh, -bv, -ch, bo, -bo, ch, bv, -bh, ca, cc, -bj, bt, cj, -bq, bm, -cf, -bx, bf, -by, -ce, bl, -br, ck, bs, -bk, cd, bz, -bg, bw, cg, -bn, bp, -ci, -bu, bi, -cb }, \ + { ba, -ar, av, -be, -aw, aq, -az, -bb, as, -au, bd, ax, -ap, ay, bc, -at, at, -bc, -ay, ap, -ax, -bd, au, -as, bb, az, -aq, aw, be, -av, ar, -ba, -ba, ar, -av, be, aw, -aq, az, bb, -as, au, -bd, -ax, ap, -ay, -bc, at, -at, bc, ay, -ap, ax, bd, -au, as, -bb, -az, aq, -aw, -be, av, -ar, ba }, \ 
+ { cc, -bl, bp, -cg, -by, bh, -bt, ck, bu, -bg, bx, ch, -bq, bk, -cb, -cd, bm, -bo, cf, bz, -bi, bs, -cj, -bv, bf, -bw, -ci, br, -bj, ca, ce, -bn, bn, -ce, -ca, bj, -br, ci, bw, -bf, bv, cj, -bs, bi, -bz, -cf, bo, -bm, cd, cb, -bk, bq, -ch, -bx, bg, -bu, -ck, bt, -bh, by, cg, -bp, bl, -cc }, \ + { ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac }, \ + { cd, -bo, bk, -bz, -ch, bs, -bg, bv, -ck, -bw, bh, -br, cg, ca, -bl, bn, -cc, -ce, bp, -bj, by, ci, -bt, bf, -bu, cj, bx, -bi, bq, -cf, -cb, bm, -bm, cb, cf, -bq, bi, -bx, -cj, bu, -bf, bt, -ci, -by, bj, -bp, ce, cc, -bn, bl, -ca, -cg, br, -bh, bw, ck, -bv, bg, -bs, ch, bz, -bk, bo, -cd }, \ + { bb, -au, aq, -ax, be, ay, -ar, at, -ba, -bc, av, -ap, aw, -bd, -az, as, -as, az, bd, -aw, ap, -av, bc, ba, -at, ar, -ay, -be, ax, -aq, au, -bb, -bb, au, -aq, ax, -be, -ay, ar, -at, ba, bc, -av, ap, -aw, bd, az, -as, as, -az, -bd, aw, -ap, av, -bc, -ba, at, -ar, ay, be, -ax, aq, -au, bb }, \ + { ce, -br, bf, -bs, cf, cd, -bq, bg, -bt, cg, cc, -bp, bh, -bu, ch, cb, -bo, bi, -bv, ci, ca, -bn, bj, -bw, cj, bz, -bm, bk, -bx, ck, by, -bl, bl, -by, -ck, bx, -bk, bm, -bz, -cj, bw, -bj, bn, -ca, -ci, bv, -bi, bo, -cb, -ch, bu, -bh, bp, -cc, -cg, bt, -bg, bq, -cd, -cf, bs, -bf, br, -ce }, \ + { an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an, an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an }, \ + { cf, -bu, bj, -bl, bw, -ch, -cd, bs, -bh, bn, -by, cj, cb, -bq, bf, -bp, ca, ck, -bz, bo, -bg, br, -cc, -ci, bx, -bm, bi, -bt, ce, cg, -bv, bk, -bk, bv, -cg, -ce, bt, -bi, bm, -bx, ci, cc, -br, bg, -bo, bz, -ck, -ca, bp, -bf, bq, -cb, -cj, by, -bn, bh, -bs, cd, ch, -bw, bl, -bj, bu, -cf }, \ + { bc, -ax, as, -aq, av, -ba, -be, az, -au, ap, -at, ay, -bd, -bb, aw, -ar, ar, -aw, bb, bd, -ay, at, -ap, au, -az, be, ba, -av, aq, -as, ax, -bc, -bc, ax, -as, aq, -av, ba, be, -az, au, -ap, at, -ay, bd, bb, -aw, ar, -ar, aw, -bb, -bd, ay, -at, ap, -au, az, -be, -ba, av, -aq, as, -ax, bc }, \ + { cg, -bx, bo, -bf, bn, -bw, cf, ch, -by, bp, -bg, bm, -bv, ce, ci, -bz, bq, -bh, bl, -bu, cd, cj, -ca, br, -bi, bk, -bt, cc, ck, -cb, bs, -bj, bj, -bs, cb, -ck, -cc, bt, -bk, bi, -br, ca, -cj, -cd, bu, -bl, bh, -bq, bz, -ci, -ce, bv, -bm, bg, -bp, by, -ch, -cf, bw, -bn, bf, -bo, bx, -cg }, \ + { ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag }, \ + { ch, -ca, bt, -bm, bf, -bl, bs, -bz, cg, ci, -cb, bu, -bn, bg, -bk, br, -by, cf, cj, -cc, bv, -bo, bh, -bj, bq, -bx, ce, ck, -cd, bw, -bp, bi, -bi, bp, -bw, cd, -ck, -ce, bx, -bq, bj, -bh, bo, -bv, cc, -cj, -cf, by, -br, bk, -bg, bn, -bu, cb, -ci, -cg, bz, -bs, bl, -bf, bm, -bt, ca, -ch }, \ + { bd, -ba, ax, -au, ar, -ap, as, -av, ay, -bb, be, bc, -az, aw, -at, aq, -aq, at, -aw, az, -bc, -be, bb, -ay, av, -as, ap, -ar, au, -ax, ba, -bd, -bd, ba, -ax, au, -ar, ap, -as, av, -ay, bb, -be, -bc, az, -aw, at, -aq, aq, -at, aw, -az, bc, be, -bb, ay, -av, as, -ap, ar, -au, ax, -ba, bd }, \ + 
{ ci, -cd, by, -bt, bo, -bj, bf, -bk, bp, -bu, bz, -ce, cj, ch, -cc, bx, -bs, bn, -bi, bg, -bl, bq, -bv, ca, -cf, ck, cg, -cb, bw, -br, bm, -bh, bh, -bm, br, -bw, cb, -cg, -ck, cf, -ca, bv, -bq, bl, -bg, bi, -bn, bs, -bx, cc, -ch, -cj, ce, -bz, bu, -bp, bk, -bf, bj, -bo, bt, -by, cd, -ci }, \ + { ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao, ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao }, \ + { cj, -cg, cd, -ca, bx, -bu, br, -bo, bl, -bi, bf, -bh, bk, -bn, bq, -bt, bw, -bz, cc, -cf, ci, ck, -ch, ce, -cb, by, -bv, bs, -bp, bm, -bj, bg, -bg, bj, -bm, bp, -bs, bv, -by, cb, -ce, ch, -ck, -ci, cf, -cc, bz, -bw, bt, -bq, bn, -bk, bh, -bf, bi, -bl, bo, -br, bu, -bx, ca, -cd, cg, -cj }, \ + { be, -bd, bc, -bb, ba, -az, ay, -ax, aw, -av, au, -at, as, -ar, aq, -ap, ap, -aq, ar, -as, at, -au, av, -aw, ax, -ay, az, -ba, bb, -bc, bd, -be, -be, bd, -bc, bb, -ba, az, -ay, ax, -aw, av, -au, at, -as, ar, -aq, ap, -ap, aq, -ar, as, -at, au, -av, aw, -ax, ay, -az, ba, -bb, bc, -bd, be }, \ + { ck, -cj, ci, -ch, cg, -cf, ce, -cd, cc, -cb, ca, -bz, by, -bx, bw, -bv, bu, -bt, bs, -br, bq, -bp, bo, -bn, bm, -bl, bk, -bj, bi, -bh, bg, -bf, bf, -bg, bh, -bi, bj, -bk, bl, -bm, bn, -bo, bp, -bq, br, -bs, bt, -bu, bv, -bw, bx, -by, bz, -ca, cb, -cc, cd, -ce, cf, -cg, ch, -ci, cj, -ck }, \ + } + +// DCT-8 +#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d,}, \ + { b, 0, -b, -b,}, \ + { c, -b, -d, a,}, \ + { d, -b, a, -c,}, \ +} + +#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { b, e, h, -g, -d, -a, -c, -f,}, \ + { c, h, -e, -a, -f, g, b, d,}, \ + { d, -g, -a, -h, c, e, -f, -b,}, \ + { e, -d, -f, c, g, -b, -h, a,}, \ + { f, -a, g, e, -b, h, d, -c,}, \ + { g, -c, b, -f, -h, d, -a, e,}, \ + { h, -f, d, -b, a, -c, e, -g,}, \ +} + +#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n,}, \ + { c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l,}, \ + { d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j,}, \ + { e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h,}, \ + { f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f,}, \ + { g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d,}, \ + { h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b,}, \ + { i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a,}, \ + { j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c,}, \ + { k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e,}, \ + { l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g,}, \ + { m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i,}, \ + { n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k,}, \ + { o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m,}, \ + { p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o,}, \ +} + +#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D,}, \ + { c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B,}, \ + { d, k, r, y, F, -A, -t, -m, -f, 
-b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z,}, \ + { e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x,}, \ + { f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v,}, \ + { g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t,}, \ + { h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r,}, \ + { i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p,}, \ + { j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n,}, \ + { k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l,}, \ + { l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j,}, \ + { m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h,}, \ + { n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f,}, \ + { o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d,}, \ + { p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b,}, \ + { q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a,}, \ + { r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c,}, \ + { s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e,}, \ + { t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g,}, \ + { u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i,}, \ + { v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k,}, \ + { w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m,}, \ + { x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o,}, \ + { y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q,}, \ + { z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s,}, \ + { A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u,}, \ + { B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w,}, \ + { C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y,}, \ + { D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A,}, \ + { E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C,}, \ + { F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E,}, \ +} + + +// DST-7 +#define DEFINE_DST7_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d }, \ + { c, c, 0, -c }, \ + { d, -a, -c, b }, 
\ + { b, -d, c, -a }, \ +} + +#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { c, f, h, e, b, -a, -d, -g,}, \ + { e, g, b, -c, -h, -d, a, f,}, \ + { g, c, -d, -f, a, h, b, -e,}, \ + { h, -a, -g, b, f, -c, -e, d,}, \ + { f, -e, -a, g, -d, -b, h, -c,}, \ + { d, -h, e, -a, -c, g, -f, b,}, \ + { b, -d, f, -h, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o,}, \ + { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n,}, \ + { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m,}, \ + { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l,}, \ + { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k,}, \ + { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j,}, \ + { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i,}, \ + { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h,}, \ + { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g,}, \ + { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f,}, \ + { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e,}, \ + { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d,}, \ + { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c,}, \ + { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E,}, \ + { e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D,}, \ + { g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C,}, \ + { i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B,}, \ + { k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A,}, \ + { m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z,}, \ + { o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y,}, \ + { q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x,}, \ + { s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w,}, \ + { u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v,}, \ + { w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u,}, \ + { y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t,}, \ + { A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s,}, \ + { C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r,}, \ + { E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q,}, \ + { F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p,}, \ + { 
D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o,}, \ + { B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n,}, \ + { z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m,}, \ + { x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l,}, \ + { v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k,}, \ + { t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j,}, \ + { r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i,}, \ + { p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h,}, \ + { n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g,}, \ + { l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f,}, \ + { j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e,}, \ + { h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d,}, \ + { f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c,}, \ + { d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \ +} + + +#endif // DCT_AVX2_TABLES_H diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c new file mode 100644 index 00000000..b393bce6 --- /dev/null +++ b/src/strategies/avx2/depquant-avx2.c @@ -0,0 +1,1544 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +/* +* \file +*/ + +#include "strategies/avx2/depquant-avx2.h" +#include "strategyselector.h" + + +#if COMPILE_INTEL_AVX2 && defined X86_64 +#include "dep_quant.h" + +#include <immintrin.h> +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "generic/quant-generic.h" +#include "uvg_math.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + + rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + + if (state->all_gte_four) { + // pqDataA + // In case both levels are smaller than 4, or both are >= 4, AVX2 can be used + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + // The coeffFracBits arrays are 6 elements long, so we need to offset the indices, and gather is the only efficient way to load the data + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + 
pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + // RD costs are 64-bit, so we need to extend the 32-bit values + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } + + else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + // In the original implementation the goRiceTab is selected beforehand, but since we need to load from + // potentially four different locations, we need to calculate the offsets and use gather + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i *)&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + // AVX2 cannot be used, so we have to loop over the values normally + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); + } + + // pqDataB, same handling as for pqDataA + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqBs[4] = {2, 2, 1, 1}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqB = pqBs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqB] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_b = + _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + // This loads the values such that they are laid out as + // |State 0 Flag 0|State 0 Flag 1|State 1 Flag 0|State 1 Flag 1|State 2 Flag 0|State 2 Flag 1|State 3 Flag 0|State 3 Flag 1| + // By setting the flag 1 bits to zero we get the flag 0 values as 64-bit integers (even positions), which can be summed into the rd_cost + // The flag 1 values can be shifted right by 32, and again we have 64-bit integers holding the values (odd positions), which can be summed into the rd_cost + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + + // Same here + __m256i m_sigFracBits_0 = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i m_sigFracBits_1 = _mm256_srli_epi64(original, 32); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + __m256i m_sbbFracBits_1 = _mm256_srli_epi64(original, 32); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + int num_sig_sbb; + memcpy(&num_sig_sbb, &state->m_numSigSbb[start], 4); + // numSigSbb only has values one or zero, so if all 4 values are 1 the complete value is 0x01010101 + if (num_sig_sbb == 0x01010101) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); + } + else if (num_sig_sbb == 0) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); + } + + else { + const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; + _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); + _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); + _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + } + } else if (state->all_lt_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(31); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); + // RD cost A + 
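+ // The A and B costs below vectorize the scalar expression used in the fallback path: cost += (1 << SCALE_BITS) + goRiceTab[absLevel <= goRiceZero ? absLevel - 1 : min(absLevel, RICEMAX - 1)]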
{ + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + // Calculate mask for pqDataA->absLevel <= state->m_goRiceZero + // The mask is the reverse of the one used in the scalar code, so the operands are in the opposite order in the blendv + __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); + + // pqDataA->absLevel < RICEMAX ? pqDataA->absLevel : RICEMAX - 1 + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + // pqDataA->absLevel - 1 + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + // Again calculate the offset for the different go_rice_tabs + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + //(1 << SCALE_BITS) + goRiceTab[selected] + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost B, same as RD cost A + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + // This time the go_rice_tab is offset only by the go_rice_zero + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); + } + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + // Reorder the costs so that the cost of state 0 is in the first element, state 1 in the second, etc. + // (216 == 0b11011000 selects the 64-bit lanes {0, 2, 1, 3}, 141 == 0b10001101 the lanes {1, 3, 0, 2}) + rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); + rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); + rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); + __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); + + __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); + __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); + __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); + __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + + // Store the data for all of the costs so that the lower 32 bits hold the coefficient magnitude and the upper 32 bits the previous state + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); + __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); + __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); + __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); + + __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); + __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); + __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); + + __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_decision, rd_cost_z); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_decision, rd_cost_z, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(decision_data, z_data, z_vs_decision); + 
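+ // Pick the overall winner per state: cheaper_first holds min(A, B) and cheaper_second holds min(Z, previous decision), so one more compare-and-blend yields the cheapest of the four candidates together with its matching data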
__m256i final_decision = _mm256_cmpgt_epi64(cheaper_second, cheaper_first); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_second, cheaper_first, final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_second_data, cheaper_first_data, final_decision); + + _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); + final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); +} + + +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) +{ + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +} + + +static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) +{ + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) +{ + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block * qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, const uint32_t height_in_sbb, + const uint32_t next_sbb_right, const uint32_t next_sbb_below, + const Decision* decisions) +{ + all_depquant_states* state = &ctxs->m_allStates; + bool all_above_minus_two = true; + bool all_between_zero_and_three = true; + bool all_above_four = true; + + + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + for (int i = 0; i < 4; 
++i) { + all_above_minus_two &= decisions->prevId[i] > -2; + all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; + all_above_four &= decisions->prevId[i] >= 4; + } + if (all_above_minus_two) { + bool all_have_previous_state = true; + __m128i prev_state; + __m128i prev_state_no_offset; + __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12); + if (all_above_four) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); + prev_state = _mm_add_epi32( + prev_state, + prev_state_no_offset + ); + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(state->m_absLevels[state_offset >> 2], 0, 64 * sizeof(uint8_t)); + + } else if (all_between_zero_and_three) { + prev_state_no_offset = _mm_load_si128((const __m128i*)decisions->prevId); + prev_state = _mm_add_epi32( + prev_state_no_offset, + _mm_set1_epi32(ctxs->m_prev_state_offset) + ); + // Set the high bytes to 0xff so that the shuffle will set them to zero and it won't cause problems with the min_epi32 + __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); + num_sig_sbb = _mm_or_si128( + num_sig_sbb, + _mm_min_epi32(abs_level, _mm_set1_epi32(1)) + ); + + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); + int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); + + // Shuffle so that temp_prev_state has the four previous-state bytes packed into the first 4 bytes and duplicated across the register + __m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + // Duplicate the state all over the vector so that all 32 bytes hold the previous states + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + // Increment the second set by four, the third by eight and the fourth by twelve, and repeat for the second lane + __m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); + } + } else { + // TODO: it would be possible to do the absLevels update with AVX2 even here; we would just need to set the shuffle mask to + // 0xff for the states that don't have a previous state or whose previous state is a skip state + int prev_state_s[4] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) { + const int decision_id = i; + const int curr_state_offset = state_offset + i; + if (decisions->prevId[decision_id] >= 4) { + prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } + } else if (decisions->prevId[decision_id] >= 0) { + prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; 
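+ // Regular previous state: carry over its significance flag (or set it if this decision has a nonzero level) and copy this state's lane of the interleaved absLevels array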
+ state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; + for (int j = 0; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][j + decisions->prevId[decision_id]]; + } + } else { + state->m_numSigSbb[curr_state_offset] = 1; + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } + all_have_previous_state = false; + } + } + prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4); + + + // Update common context + __m128i last; + { + const uint32_t numSbb = width_in_sbb * height_in_sbb; + common_context* cc = &ctxs->m_common_context; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scan_pos * 4; + uint8_t* levels_in = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels + scan_pos * 4; + int previous_state_array[4]; + _mm_storeu_si128((__m128i*)previous_state_array, prev_state); + + if (all_have_previous_state) { + __m128i temp_p_state = _mm_shuffle_epi8(prev_state, control); + // Similarly to how the abs level was handled earlier, set the previous state duplicated across the lane + __m128i ref_sbb_ctx_offset = _mm_load_si128((__m128i*)ctxs->m_allStates.m_refSbbCtxId); + ref_sbb_ctx_offset = _mm_shuffle_epi8(ref_sbb_ctx_offset, temp_p_state); + // numSbb is two or four; in case it is one, this function is never called + if (numSbb <= 4) { + __m128i incremented_ref_sbb_ctx_offset = _mm_add_epi8( + ref_sbb_ctx_offset, + _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12) + ); + // In case the ref_sbb_ctx is minus one, the values need to be set to zero. This is achieved by + // first finding which states have the minus one, and then using a blend after the load to + // set the corresponding values to zero + __m128i blend_mask = _mm_cmpeq_epi8(ref_sbb_ctx_offset, _mm_set1_epi32(0xffffffff)); + __m128i sbb_flags = _mm_loadu_si128((__m128i*)cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags); + sbb_flags = _mm_shuffle_epi8(sbb_flags, incremented_ref_sbb_ctx_offset); + sbb_flags = _mm_blendv_epi8(sbb_flags, _mm_set1_epi64x(0), blend_mask); + if (numSbb == 2) { + uint64_t temp = _mm_extract_epi64(sbb_flags, 0); + memcpy(sbbFlags, &temp, 8); + } else { + _mm_storeu_si128((__m128i*)sbbFlags, sbb_flags); + } + } else { + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) + ); + // Unlike the case for two or four sbb, the blendv is used to set the shuffle mask to -1 so that + // the shuffle will set the values to zero. 
It's better to do it this way here so that the blendv is + // not called in the loop; the other case is done the other way because I implemented it first + // and only realized afterwards that this order is better + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (int i = 0; i < numSbb * 4; i += 32) { + __m256i sbb_flags = _mm256_loadu_si256((__m256i*)(&cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i])); + sbb_flags = _mm256_shuffle_epi8(sbb_flags, inc_ref_state); + _mm256_store_si256((__m256i*)&sbbFlags[i], sbb_flags); + } + } + // The first 16 values will be loaded from the previous state, so this can start from 16 + int levels_start = 16; + // Do the AVX2-optimized version for the part that is divisible by 8 (four states of 8 one-byte values) + const uint64_t limit = setCpSize & ~(8 - 1); + if (levels_start < limit) { + // Overall this is the same as the numSbb > 4 case + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) + ); + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (; levels_start < limit; levels_start += 8) { + __m256i levels_v = _mm256_loadu_si256((__m256i*)(&levels_in[levels_start * 4])); + levels_v = _mm256_shuffle_epi8(levels_v, inc_ref_state); + _mm256_store_si256((__m256i*)&levels[levels_start * 4], levels_v); + } + } + uint8_t ref_sbb[4]; + int temp_sbb_ref = _mm_extract_epi32(ref_sbb_ctx_offset, 0); + memcpy(ref_sbb, &temp_sbb_ref, 4); + // Do the excess that is not divisible by 8 + for (;levels_start < setCpSize; ++levels_start) { + uint8_t new_values[4]; + new_values[0] = ref_sbb[0] != 0xff ? levels_in[levels_start * 4 + ref_sbb[0]] : 0; + new_values[1] = ref_sbb[1] != 0xff ? levels_in[levels_start * 4 + ref_sbb[1]] : 0; + new_values[2] = ref_sbb[2] != 0xff ? levels_in[levels_start * 4 + ref_sbb[2]] : 0; + new_values[3] = ref_sbb[3] != 0xff ? levels_in[levels_start * 4 + ref_sbb[3]] : 0; + memcpy(&levels[levels_start * 4], new_values, 4); + } + + } + else { + // TODO: This could also be done using AVX2; we would just need to check both whether the previous state + // is minus one and whether the ref_sbb_ctx_id is minus one. + for (int curr_state = 0; curr_state < 4; ++curr_state) { + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[i * 4 + curr_state] = levels_in[i * 4 + prev_sbb]; + } + } else { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[ i * 4 + curr_state] = 0; + } + } + } + } + memcpy(levels, ctxs->m_allStates.m_absLevels[state_offset / 4], 64); + memcpy(&sbbFlags[cg_pos * 4], &ctxs->m_allStates.m_numSigSbb[state_offset], 4); + + __m128i sbb_right = next_sbb_right ? 
+ _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) : + _mm_set1_epi32(0); + + __m128i sbb_below = next_sbb_below ? + _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_below * 4])) : + _mm_set1_epi32(0); + + __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); + sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); + // Gather is not necessary here, but it would require at least five operations to do the same thing, + // so in my opinion the performance gain is not worth the readability loss + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long int *)cc->m_sbbFlagBits[0], sig_sbb, 8); + _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); + + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(&state->m_goRicePar[state_offset], 0, 4); + + uint8_t states[4] = {0, 1, 2, 3}; + memcpy(&state->m_refSbbCtxId[state_offset], states, 4); + if (all_have_previous_state) { + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); + } else { + const int temp = (state->effWidth * state->effHeight * 28) / 16; + for (int i = 0; i < 4; ++i) { + if (previous_state_array[i] != -1) { + state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; + } else { + state->m_remRegBins[i + state_offset] = temp; + } + } + } + + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4; + + __m128i ones = _mm_set1_epi32(1); + __m128i fours = _mm_set1_epi32(4); + __m256i all[4]; + uint64_t temp[4]; + + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num == 0) { + temp[id % 4] = 0; + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + } + continue; + } + __m128i sum_abs = _mm_set1_epi32(0); + __m128i sum_abs_1 = _mm_set1_epi32(0); + __m128i sum_num = _mm_set1_epi32(0); + switch (nbOut->num) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[4] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones) + ) + ); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 4: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[3] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 3: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[2] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 2: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[1] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + 
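+ // Note: the cases above fall through on purpose, so each neighbour count accumulates all the positions below it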
case 1: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[0] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + break; + default: + assert(0); + } + sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); + sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); + __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); + template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); + __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); + temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + last = template_ctx_init; + } + } + + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][0]), all[0]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][16]), all[1]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][32]), all[2]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); + + memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); + } + // End update common context + + __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_si128( + _mm_srli_epi32(last, 3), + _mm_set1_epi32(31)); + + __m128i sum_abs_min = _mm_min_epi32( + _mm_set1_epi32(3), + _mm_srli_epi32( + _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), + 1)); + + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + offsets = _mm_add_epi32(offsets, sum_abs_min); + __m256i sig_frac_bits = _mm256_i32gather_epi64((long long const*)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); + // These are 192 bits so no benefit from using avx2 + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); + } + } + else { + for (int i = 0; i < 4; i++) { + uvg_dep_quant_update_state_eos( + ctxs, + scan_pos, + cg_pos, + sigCtxOffsetNext, + gtxCtxOffsetNext, + width_in_sbb, + height_in_sbb, + next_sbb_right, + next_sbb_below, + decisions, + i); + } + } +} + +static INLINE void update_states_avx2( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag) +{ + all_depquant_states* state = &ctxs->m_allStates; + + bool all_non_negative = true; + bool all_above_minus_two = true; + bool all_minus_one = true; + for (int i = 0; i < 4; ++i) { + all_non_negative &= decisions->prevId[i] >= 0; + all_above_minus_two &= decisions->prevId[i] > -2; + all_minus_one &= decisions->prevId[i] == -1; + } + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + 
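// The four RD costs chosen by xDecide become the starting RD costs of the four new states
+ 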
_mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
+ if (all_above_minus_two) {
+
+ bool rem_reg_all_gte_4 = true;
+ bool rem_reg_all_lt4 = true;
+ __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
+
+ __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
+ if (all_non_negative) {
+ __m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId);
+ __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
+ __m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset);
+ __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control);
+
+ // Only zero vs. non-zero matters for the sig_sbb values, so make sure they stay at one or zero,
+ // which allows some optimizations when handling the values in update_state_eos_avx2
+ __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb);
+ sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states);
+ __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1));
+ has_coeff = _mm_shuffle_epi8(has_coeff, control);
+ sig_sbb = _mm_or_si128(sig_sbb, has_coeff);
+ int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0);
+ memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4);
+
+ // The following two are just shuffled, and then the 4 bytes that store the values are extracted
+ __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId);
+ ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states);
+ int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0);
+ memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4);
+
+ __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar);
+ go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states);
+ int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+ memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+ // Again, gather is not necessary but it is easier to read and shouldn't have too large a performance hit
+ // Should be true for all gathers here
+ __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sbbFracBits[0], prv_states, 8);
+ _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
+
+ // Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1;
+ __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4);
+ __m128i ones = _mm_set1_epi32(1);
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones);
+
+ __m128i reg_bins_sub = _mm_set1_epi32(0);
+ // Next two lines: (decision->absLevel < 2 ? (unsigned)decision->absLevel : 3)
+ __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2));
+ __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two);
+
+ // Depending on whether the rem_reg_bins are smaller than four or not,
+ // reg_bins_sub is either 0 or the result of the above operation
+ __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four);
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub);
+ _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins);
+
+ // Save whether all rem_reg_bins are >= 4 and whether all are < 4, as both facts
+ // are needed in multiple places
+ __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+ int bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+ mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+ // This is the same as in update_state_eos_avx2
+ __m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control);
+ __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state);
+ prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0);
+ __m256i temp_add = _mm256_setr_epi32(
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c,
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c);
+ prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add);
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
+ data = _mm256_shuffle_epi8(data, prev_state_256);
+ _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
+ }
+
+ // This is overall the same as for absLevels, but since the ctx values are two bytes, all of the
+ // masks have to account for that
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ prev_state_full = _mm256_slli_epi16(prev_state_full, 1);
+ temp_add = _mm256_setr_epi8(
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 8, 9, 8, 9, 8, 9, 8, 9,
+ 16, 17, 16, 17, 16, 17, 16, 17,
+ 24, 25, 24, 25, 24, 25, 24, 25);
+ prev_state_full = _mm256_add_epi8(prev_state_full, temp_add);
+
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint16_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i]));
+ data = _mm256_shuffle_epi8(data, prev_state_full);
+ _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data);
+ }
+ }
+ else if (all_minus_one) {
+ memset(&state->m_numSigSbb[state_offset], 1, 4);
+ memset(&state->m_refSbbCtxId[state_offset], -1, 4);
+
+ const int a = (state->effWidth * state->effHeight * 28) / 16;
+
+ __m128i rem_reg_bins = _mm_set1_epi32(a);
+ __m128i sub = _mm_blendv_epi8(
+ _mm_set1_epi32(3),
+ abs_level,
+ _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2))
+ );
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub);
+ _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
+
+ __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+ int bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+ mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+ memset(state->m_absLevels[state_offset >> 2], 0, 16 * sizeof(uint8_t) * 4);
+ memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4);
+
+ }
+ else {
+ for (int i = 0; i < 4; ++i) {
+ const int decision_id = i;
+ const int state_id = state_offset + i;
+ if (decisions->prevId[decision_id] >= 0) {
+ const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
+ state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id];
+ state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState];
+ state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0];
+ state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1];
+ state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1;
+ state->m_goRicePar[state_id] = state->m_goRicePar[prvState];
+ if (state->m_remRegBins[state_id] >= 4) {
+ state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+ }
+ } else {
+ state->m_numSigSbb[state_id] = 1;
+ state->m_refSbbCtxId[state_id] = -1;
+ int ctxBinSampleRatio = 28;
+ state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+ }
+ rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
+ rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
+ }
+ {
+ // Same as for the all_non_negative case, but use blendv to set the shuffle mask to -1 for the states that do not have a previous state
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ __m256i temp_add = _mm256_setr_epi32(
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c,
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c);
+ __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1));
+ prev_state_full = _mm256_add_epi8(prev_state_full, temp_add);
+ prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask);
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
+ data = _mm256_shuffle_epi8(data, prev_state_full);
+ _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
+ }
+ }
+
+ {
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1));
+ prev_state_full = _mm256_slli_epi16(prev_state_full, 1);
+ __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17, 16, 17, 16, 17, 24, 25, 24, 25, 24, 25, 24, 25);
+
+ prev_state_full = 
_mm256_add_epi8(prev_state_full, temp_add); + prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask); + + for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { + __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i])); + data = _mm256_shuffle_epi8(data, prev_state_full); + _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data); + } + } + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4); + + state->all_gte_four = rem_reg_all_gte_4; + state->all_lt_four = rem_reg_all_lt4; + + if (rem_reg_all_gte_4) { + const __m128i ones = _mm_set1_epi32(1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); + __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); + __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); + + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(t, ones)); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 1: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } break; + default: + assert(0); + } + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + __m128i temp = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), + _mm_set1_epi32(3)); + offsets = _mm_add_epi32(offsets, temp); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); + 
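// Per lane, offsets picks entry sigCtxOffsetNext + MIN((sum_abs1 + 1) >> 1, 3) from that
+ // state's own fractional-bit table (the 12 * lane term), matching the scalar lookup further below
+ 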
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + } + + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + // int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + __m128i sum_all = _mm_max_epi32( + _mm_min_epi32( + _mm_set1_epi32(31), + _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), + _mm_set1_epi32(0)); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + } + } + + else if (rem_reg_all_lt4) { + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + // This cannot be vectorized because there is no way to dynamically shift values + for (int i = 0; i < 4; ++i) { + state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i]; + } + } + + } + else { + for (int i = 0; i < 4; ++i) { + const int state_id = state_offset + i; + uint8_t* levels = (uint8_t*)(state->m_absLevels[state_offset >> 2]); + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs1 += MIN(4 + (t & 1), t); \ + sumNum += !!t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } else { + coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; + } + } + } + } else { + for (int i = 0; i < 4; ++i) { + state->all_gte_four = true; + state->all_lt_four = true; + uvg_dep_quant_update_state( + ctxs, + numIPos, + scan_pos, + decisions, + sigCtxOffsetNext, + gtxCtxOffsetNext, + next_nb_info_ssb, + baseLevel, + extRiceFlag, + i); + } + } +} + +void uvg_dep_quant_decide_and_update_avx2( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +void uvg_find_first_non_zero_avx2(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, const int width, const int height) +{ + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; + int temp = *firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i+ 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + 
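// For blocks wider than 8 the rows of a 4x4 scan group are a full stride apart,
+ // so the remaining rows are gathered into temp_d one row (four coefficients) at a time
+ 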
memcpy(temp_d + 4, &srcCoeff[i], 8);
+ i += width;
+ memcpy(temp_d + 8, &srcCoeff[i], 8);
+ i += width;
+ memcpy(temp_d + 12, &srcCoeff[i], 8);
+
+ sbb_data = _mm256_loadu_si256((__m256i const*)temp_d);
+ }
+ sbb_data = _mm256_abs_epi16(sbb_data);
+
+ __m256i a = _mm256_cmpgt_epi16(sbb_data, th);
+ if (!_mm256_testz_si256(a, a))
+ {
+ if (temp >= 0) {
+ temp += 15;
+ }
+ break;
+ }
+ }
+ }
+ for (;temp >= 0; temp--) {
+ if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
+ break;
+ }
+ }
+ }
+
+ *firstTestPos = temp;
+}
+
+
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
+{
+ bool success = true;
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+ success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
+ success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "avx2", 40, &uvg_find_first_non_zero_avx2);
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+ return success;
+}
diff --git a/src/strategies/avx2/depquant-avx2.h b/src/strategies/avx2/depquant-avx2.h
new file mode 100644
index 00000000..e6db110c
--- /dev/null
+++ b/src/strategies/avx2/depquant-avx2.h
@@ -0,0 +1,46 @@
+#ifndef STRATEGIES_DEPQUANT_AVX2_H_
+#define STRATEGIES_DEPQUANT_AVX2_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_AVX2_H_
diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h
index ae1845c8..ea7f077e 100644
--- a/src/strategies/avx2/encode_coding_tree-avx2.h
+++ b/src/strategies/avx2/encode_coding_tree-avx2.h
@@ -38,13 +38,14 @@
 * Functions for writing the coding quadtree and related syntax.
 */
+#include "cu.h"
 #include "encoderstate.h"
 #include "global.h"
 void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
 cabac_data_t * const cabac,
 const coeff_t *coeff,
- uint8_t width,
+ const cu_loc_t *loc,
 uint8_t type,
 int8_t scan_mode,
 int8_t tr_skip,
diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index 53282e87..838bad91 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -42,10 +42,9 @@
 #include "strategyselector.h"
 #include "strategies/missing-intel-intrinsics.h"
-
 /**
 * \brief Generate angular predictions.
- * \param log2_width Log2 of width, range 2..5.
+ * \param cu_loc CU location and size data.
 * \param intra_mode Angular mode in range 2..34.
 * \param channel_type Color channel.
 * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
@@ -54,20 +53,28 @@
 * \param multi_ref_idx Reference line index for use with MRL.
 */
 static void uvg_angular_pred_avx2(
- const int_fast8_t log2_width,
+ const cu_loc_t* const cu_loc,
 const int_fast8_t intra_mode,
 const int_fast8_t channel_type,
 const uvg_pixel *const in_ref_above,
 const uvg_pixel *const in_ref_left,
 uvg_pixel *const dst,
- const uint8_t multi_ref_idx)
+ const uint8_t multi_ref_idx,
+ const uint8_t isp_mode,
+ const int cu_dim)
 {
-
- assert(log2_width >= 2 && log2_width <= 5);
+ // ISP_TODO: non-square block implementation, height is passed but not used
+ const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+ const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+ const int log2_width = uvg_g_convert_to_log2[width];
+ const int log2_height = uvg_g_convert_to_log2[height];
+
+ assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
 assert(intra_mode >= 2 && intra_mode <= 66);
 // TODO: implement handling of MRL
 uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0;
+ uint8_t isp = isp_mode;
 __m256i p_shuf_01 = _mm256_setr_epi8(
 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04,
@@ -142,7 +149,6 @@ static void uvg_angular_pred_avx2(
 //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
 uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
 uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
- const int_fast32_t width = 1 << log2_width;
 int32_t pred_mode = intra_mode; // ToDo: handle WAIP
@@ -345,13 +351,13 @@ static void uvg_angular_pred_avx2(
 // PDPC
- bool PDPC_filter = (width >= 4 || channel_type != 0);
+ bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0);
 if (pred_mode > 1 && pred_mode < 67) {
 if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
 PDPC_filter = false;
 } else if (mode_disp > 0) {
- PDPC_filter = (scale >= 0);
+ PDPC_filter &= (scale >= 0);
 }
 }
 if(PDPC_filter) {
@@ -497,20 +503,27 @@
 /**
 * \brief Generate planar prediction.
- * \param log2_width Log2 of width, range 2..5.
+ * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. */ static void uvg_intra_pred_planar_avx2( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uint8_t *const ref_top, const uint8_t *const ref_left, uint8_t *const dst) { - assert(log2_width >= 2 && log2_width <= 5); + // ISP_TODO: non-square block implementation, height is passed but not used + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); - const int_fast8_t width = 1 << log2_width; const uint8_t top_right = ref_top[width + 1]; const uint8_t bottom_left = ref_left[width + 1]; @@ -964,12 +977,17 @@ static void uvg_intra_pred_filtered_dc_avx2( */ static void uvg_pdpc_planar_dc_avx2( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; __m256i shuf_mask_byte = _mm256_setr_epi8( 0, -1, 0, -1, 0, -1, 0, -1, diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index df90f149..26eb535e 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4) static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { + assert(width == height && "Non square not yet implemented"); __m256i ssd_part; __m256i diff = _mm256_setzero_si256(); __m128i sum; @@ -1743,40 +1744,32 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t* return diff; } -static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) { - +static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) { + // ISP_TODO: non-square block implementation, height is passed but not used __m128i diff = _mm_setzero_si128(); switch (width) { case 4: - diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[0]), diff); - diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[4]), diff); - diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[8]), diff); - diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[12]), 
diff); + for (int y = 0; y < height; y+=4) { + diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, pred_in + (y + 2) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff); + } break; case 8: - diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[0]), diff); - diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[8]), diff); - diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[16]), diff); - diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[24]), diff); - diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[32]), diff); - diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[40]), diff); - diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[48]), diff); - diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[56]), diff); + for (int y = 0; y < height; y += 2) { + diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y * 8]), diff); + diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff); + } break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]); _mm_storeu_si128((__m128i*) & residual[x + y * width], diff); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 5c39fe11..cada96f1 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -380,20 +380,24 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? 
MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift); + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; int32_t last_cg = -1; @@ -402,7 +406,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr // Loading once is enough if scaling lists are not off __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256(); if (!(state->encoder_control->scaling_list.enable)) { - low_b = _mm256_set1_epi32(quant_coeff[0]); + low_b = _mm256_set1_epi32(default_quant_coeff); high_b = low_b; } @@ -579,33 +583,60 @@ static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec)); } -static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){ +static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width, int height){ - switch (width) { + if (height == width || width >= 16) { + switch (width) { case 4: - *(int32_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - *(int32_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int32_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int32_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int32_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int32_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int32_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int32_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); break; case 8: - *(int64_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - *(int64_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int64_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int64_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * 
width, pred_in + 3 * in_stride); - *(int64_t*)&(rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); - *(int64_t*)&(rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); - *(int64_t*)&(rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); - *(int64_t*)&(rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); + *(int64_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int64_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int64_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int64_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int64_t*)& (rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); + *(int64_t*)& (rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); + *(int64_t*)& (rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); + *(int64_t*)& (rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { - *(int64_t*)&(rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); - *(int64_t*)&(rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); + *(int64_t*)& (rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); + *(int64_t*)& (rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); } } break; + } + } + else { + switch (width) { + case 4: + for (int y = 0; y < height; y += 4) { + *(int32_t*)& (rec_out[(y + 0) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 0) * width, pred_in + (y + 0) * in_stride); + *(int32_t*)& (rec_out[(y + 1) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 1) * width, pred_in + (y + 1) * in_stride); + *(int32_t*)& (rec_out[(y + 2) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 2) * width, pred_in + (y + 2) * in_stride); + *(int32_t*)& (rec_out[(y + 3) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 3) * width, pred_in + (y + 3) * in_stride); + } + break; + case 8: + for (int y = 0; y < height; ++y) { + *(int32_t*)& (rec_out[y * out_stride]) = get_quantized_recon_8x1_avx2(residual + y * width, pred_in + y * in_stride); + } + break; + default: + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; + rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); + } + } + break; + } } } @@ -626,7 +657,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, * \returns Whether coeff_out contains any non-zero coefficients. 
*/ int uvg_quantize_residual_avx2(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uint8_t *const ref_in, const uint8_t *const pred_in, @@ -637,15 +668,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Temporary arrays to pass data to and from uvg_quant and transform functions. ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - - const int height = width; // TODO: height for non-square blocks + // ISP_TODO: non-square block implementation, height is passed but not used + int has_coeffs = 0; assert(width <= TR_MAX_WIDTH); assert(width >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; @@ -662,40 +693,51 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } // Quantize coeffs. (coeff -> coeff_out) - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if(!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); + uvg_rdoq(state, coeff, coeff_out, width, height, color, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if (state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } // Check if there are any non-zero coefficients. 
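+ // (Equivalent to the scalar check "if (coeff_out[i] != 0) { has_coeffs = 1; break; }" done
+ // eight 16-bit coefficients at a time: _mm_testz_si128 is 1 only when all loaded bits are zero.)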
- for (int i = 0; i < width * width; i += 8) {
+ for (int i = 0; i < width * height; i += 8) {
 __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i]));
 has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff);
 if(has_coeffs) break;
@@ -705,25 +747,25 @@
 // rec_out.
 if (has_coeffs && !early_skip) {
 // Get quantized residual. (coeff_out -> coeff -> residual)
- uvg_dequant(state, coeff_out, coeff, width, width, color,
+ uvg_dequant(state, coeff_out, coeff, width, height, color,
 cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
 if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
 // Inverse low frequency non-separable transform
- uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+ uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
 }
 if (use_trskip) {
- uvg_itransformskip(state->encoder_control, residual, coeff, width);
+ uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
 } else {
- uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+ uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
 }
 if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
 int y, x;
 int sign, absval;
 int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
- for (y = 0; y < width; ++y) {
+ for (y = 0; y < height; ++y) {
 for (x = 0; x < width; ++x) {
 residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
 sign = residual[x + y * width] >= 0 ? 1 : -1;
@@ -739,14 +781,14 @@
 }
 // Get quantized reconstruction. (residual + pred_in -> rec_out)
- get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width);
+ get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width, height);
 } else if (rec_out != pred_in) {
 // With no coeffs and rec_out == pred_in we skip copying the coefficients
 // because the reconstruction is just the prediction.
int y, x;
+
- for (y = 0; y < width; ++y) {
+ for (y = 0; y < height; ++y) {
 for (x = 0; x < width; ++x) {
 rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
 }
@@ -763,20 +805,26 @@
 void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
 const encoder_control_t * const encoder = state->encoder_control;
+ if (encoder->cfg.dep_quant && !transform_skip) {
+ uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
+ return;
+ }
 int32_t shift,add,coeff_q;
 int32_t n;
- int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
+ const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
+ const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+ int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
+ bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+ needs_block_size_trafo_scale |= 0; // Non log2 block size
 int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
 qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
- shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+ shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
 if (encoder->scaling_list.enable) {
- uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
- uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
 int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)(color); const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6]; @@ -797,7 +845,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); __m256i v_scale = _mm256_set1_epi32(scale); @@ -845,8 +893,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length) return parts[0] + parts[1] + parts[2] + parts[3]; } -static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); const __m256i zero = _mm256_setzero_si256(); const __m256i threes = _mm256_set1_epi16(3); const __m256i negate_hibytes = _mm256_set1_epi16(0xff00); @@ -863,7 +912,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64 __m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128); __m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128); - for (int i = 0; i < width * width; i += 32) { + for (int i = 0; i < width * height; i += 32) { __m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i)); __m256i curr_abs_lo = _mm256_abs_epi16 (curr_lo); __m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes); diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index cd05a01f..ccddf17a 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -771,6 +771,12 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + a, a, \ + a, -a \ +} + #define DEFINE_DCT2_P4_MATRIX(a,b,c) \ { \ a, a, a, a, \ @@ -1002,6 +1008,7 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, } // DCT-2 +const int16_t uvg_g_DCT2P2[4] = DEFINE_DCT2_P2_MATRIX(64); const int16_t uvg_g_DCT2P4[16] = DEFINE_DCT2_P4_MATRIX(64, 83, 36); const int16_t uvg_g_DCT2P8[64] = DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18); const int16_t uvg_g_DCT2P16[256] = DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9); @@ -1020,6 +1027,68 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77 const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4); // ********************************** DCT-2 ********************************** +static void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + + const int16_t* iT = uvg_g_DCT2P2; + + int16_t *p_coef = dst; + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* E and O */ + E = src[0] + src[1]; + O = src[0] - src[1]; + + dst[0] = (iT[0] * E + add) >> shift; + dst[line] = (iT[2] * O + add) >> shift; + + + src += 2; + dst++; + } + if (skip_line) + { + dst = p_coef + reduced_line; + for (j = 0; j < 2; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_line); + dst += line; + } + } +} + +static void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = 1 << (shift - 1); + + const int16_t* iT = uvg_g_DCT2P2; + + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ + E = iT[0] * (src[0] + src[line]); + O = iT[2] * (src[0] - src[line]); + + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ + dst[0] = (short)CLIP(-32768, 32767, (E + add) >> shift); + dst[1] = (short)CLIP(-32768, 32767, (O + add) >> shift); + + src++; + dst += 2; + } + if (skip_line) + { + memset(dst, 0, (skip_line << 1) * sizeof(int16_t)); + } +} + static void fastForwardDCT2_B4(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) { int32_t j; @@ -1366,11 +1435,6 @@ static void fastForwardDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, dst += line; } } - if (skip_line2) { - const int reduced_line = line - skip_line2; - dst = p_coef + reduced_line * 32; - memset(dst, 0, skip_line2 * 32 * sizeof(coeff_t)); - } } static void fastInverseDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) @@ -2417,16 +2481,16 @@ DCT_MTS_NXN_GENERIC(DST1, 32); typedef void partial_tr_func(const int16_t*, int16_t*, int32_t, int, int, int); // ToDo: Enable MTS 2x2 and 64x64 transforms -static partial_tr_func* dct_table[3][5] = { - { fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, - { fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, - { fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, +static partial_tr_func* dct_table[3][6] = { + { fastForwardDCT2_B2, fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, + { NULL, fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, + { NULL, fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, }; -static partial_tr_func* idct_table[3][5] = { - { fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, - { fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, - { fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, +static partial_tr_func* idct_table[3][6] = { + { fastInverseDCT2_B2, fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, + { NULL, fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, + { NULL, fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, }; @@ -2436,11 +2500,12 @@ static const tr_type_t mts_subset_intra[4][2] = { { DST7, DST7 }, 
{ DCT8, DST7 } void uvg_get_tr_type( int8_t width, + int8_t height, color_t color, const cu_info_t* tu, tr_type_t* hor_out, tr_type_t* ver_out, - const int8_t mts_idx) + const int8_t mts_type) { *hor_out = DCT2; *ver_out = DCT2; @@ -2450,13 +2515,19 @@ void uvg_get_tr_type( return; } - const int height = width; - const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER)); - const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER); + const bool explicit_mts = mts_type == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : (mts_type == UVG_MTS_INTER && tu->type == CU_INTER)); + const bool implicit_mts = tu->type == CU_INTRA && (mts_type == UVG_MTS_IMPLICIT || mts_type == UVG_MTS_INTER); assert(!(explicit_mts && implicit_mts)); + const bool is_isp = tu->type == CU_INTRA && tu->intra.isp_mode && color == COLOR_Y ? tu->intra.isp_mode : 0; + const int8_t lfnst_idx = color == COLOR_Y ? tu->lfnst_idx : tu->cr_lfnst_idx; + // const bool is_sbt = cu->type == CU_INTER && tu->sbt && color == COLOR_Y; // TODO: check SBT here when implemented - if (implicit_mts) + if (is_isp && lfnst_idx) { + return; + } + + if (implicit_mts || (is_isp && explicit_mts)) { bool width_ok = width >= 4 && width <= 16; bool height_ok = height >= 4 && height <= 16; @@ -2472,6 +2543,10 @@ void uvg_get_tr_type( return; } + /* + TODO: SBT HANDLING + */ + if (explicit_mts) { if (tu->tr_idx > MTS_SKIP) { @@ -2487,27 +2562,31 @@ static void mts_dct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { - dct_func *dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else { - const int height = width; int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0); int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - if(tu->lfnst_idx || tu->cr_lfnst_idx) { + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + //const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + //const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + + if((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { skip_width = width - 4; @@ -2520,15 +2599,20 @@ static void mts_dct_generic( } } - partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2]; + partial_tr_func* dct_hor = width != 1 ? dct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* dct_ver = height != 1 ? 
dct_table[type_ver][log2_height_minus1] : NULL; int16_t tmp[32 * 32]; - const int32_t shift_1st = log2_width_minus2 + bitdepth - 7; - const int32_t shift_2nd = log2_width_minus2 + 8; - - dct_hor(input, tmp, shift_1st, height, 0, skip_width); - dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + const int32_t shift_1st = log2_width_minus1 + bitdepth - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + if (height == 1) { + dct_hor(input, output, shift_1st, height, 0, skip_width); + } else if (width == 1) { + dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height); + } else { + dct_hor(input, tmp, shift_1st, height, 0, skip_width); + dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + } } } @@ -2538,36 +2622,57 @@ static void mts_idct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); - if (type_hor == DCT2 && type_ver == DCT2) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { - dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); + dct_func *idct_func = uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } else { - const int height = width; - const int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; - const int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0; - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0; + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2]; - partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2]; + if ((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { + if ((width == 4 && height > 4) || (width > 4 && height == 4)) { + skip_width = width - 4; + skip_height = height - 4; + } + else if ((width >= 8 && height >= 8)) { + skip_width = width - 8; + skip_height = height - 8; + } + } + + partial_tr_func* idct_hor = width != 1 ? idct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* idct_ver = height != 1 ? 
idct_table[type_ver][log2_height_minus1] : NULL;

     int16_t tmp[32 * 32];
-    const int32_t shift_1st = 7;
-    const int32_t shift_2nd = 20 - bitdepth;
+    const int max_log2_tr_dynamic_range = 15;
+    const int transform_matrix_shift = 6;

-    idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
-    idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
+    const int32_t shift_1st = transform_matrix_shift + 1;
+    const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth;
+
+    if (height == 1) {
+      idct_hor(input, output, shift_2nd + 1, height, 0, skip_width);
+    } else if (width == 1) {
+      idct_ver(input, output, shift_2nd + 1, width, 0, skip_height);
+    } else {
+      idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
+      idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
+    }
   }
 }
@@ -2582,6 +2687,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
   success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
   success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
   success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
+  //success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);
   success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);
diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c
new file mode 100644
index 00000000..b15ef52b
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.c
@@ -0,0 +1,252 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#include "strategies/generic/depquant-generic.h"
+
+#include "dep_quant.h"
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "strategyselector.h"
+#include "transform.h"
+#include "uvg_math.h"
+#include "generic/quant-generic.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = {
+  {32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144,
+   327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
+   393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
+   458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+  {65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840,
+   196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
+   360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
+   425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+  {98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
+   327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+  {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
+   196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+};
+
+
+static INLINE void checkRdCostSkipSbbZeroOut(
+  Decision* decision,
+  const all_depquant_states* const state,
+  int decision_id,
+  int skip_offset) {
+  int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0];
+  decision->rdCost[decision_id] = rdCost;
+  decision->absLevel[decision_id] = 0;
+  decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset];
+}
+
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0];
+  if (rdCost < decisions->rdCost[decision_id])
+  {
+    decisions->rdCost[decision_id] = rdCost;
+    decisions->absLevel[decision_id] = 0;
+    decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id];
+  }
+}
+
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int
+  decision_id)
+{
+  int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset;
+  if (pqData->absLevel[decision_id] < 4) {
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+  else {
+    const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1;
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] +
+      g_goRiceBits[state->m_goRicePar][value < RICEMAX ?
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block* qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void uvg_dep_quant_decide_and_update_generic( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* 
decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 0); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 1); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 2); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 3); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height) +{ + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; + int temp = *firstTestPos; + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = (enableScalingLists) ? 
(thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff));
+    if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
+      break;
+    }
+  }
+  *firstTestPos = temp;
+}
+
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
+{
+  bool success = true;
+
+  success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic);
+  success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic);
+
+  return success;
+}
diff --git a/src/strategies/generic/depquant-generic.h b/src/strategies/generic/depquant-generic.h
new file mode 100644
index 00000000..488963be
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.h
@@ -0,0 +1,50 @@
+#ifndef STRATEGIES_DEPQUANT_GENERIC_H_
+#define STRATEGIES_DEPQUANT_GENERIC_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Generic C implementations of optimized functions.
+ */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "tables.h" + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_GENERIC_H_ diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 67685f2f..c3065903 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -54,11 +54,16 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const cu_loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, - double* bits_out) { + double* bits_out) +{ + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; @@ -75,12 +80,12 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS - const int height = width; // TODO: height for non-square blocks. - const uint32_t log2_block_size = uvg_g_convert_to_bit[width]+2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = - uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint8_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint8_t log2_block_height = uvg_g_convert_to_log2[height]; + + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); // Init base contexts according to block type @@ -90,12 +95,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, unsigned scan_cg_last = (unsigned)-1; unsigned scan_pos_last = (unsigned)-1; - for (int i = 0; i < width * width; i++) { + for (int i = 0; i < (width * height); ++i) { if (coeff[scan[i]]) { scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } + scan_cg_last = scan_pos_last >> log2_cg_size; int pos_last = scan[scan_pos_last]; @@ -120,28 +126,33 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, last_coeff_x, last_coeff_y, width, - width, + height, color, scan_mode, bits_out); - uint32_t quant_state_transition_table = 0; //ToDo: dep quant enable changes this + uint32_t quant_state_transition_table = state->encoder_control->cfg.dep_quant ? 
32040 : 0; int32_t quant_state = 0; uint8_t ctx_offset[16]; int32_t temp_diag = -1; int32_t temp_sum = -1; - int32_t reg_bins = (width*width * 28) >> 4; //8 for 2x2 + int32_t reg_bins = (width * height * 28) >> 4; //8 for 2x2 // significant_coeff_flag for (i = scan_cg_last; i >= 0; i--) { //int32_t abs_coeff[64*64]; + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> log2_cg_width); + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> log2_cg_width)); + // !!! residual_coding_subblock() !!! @@ -151,7 +162,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, } else { uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + cg_pos_y, cg_width, cg_height); CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "significant_coeffgroup_flag"); } @@ -182,7 +193,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, sig = (coeff[blk_pos] != 0) ? 1 : 0; if (num_non_zero || next_sig_pos != infer_sig_pos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]); cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]); @@ -190,7 +201,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, reg_bins--; } else if (next_sig_pos != scan_pos_last) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); } @@ -256,7 +267,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, blk_pos = scan[scan_pos]; pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4); rice_param = g_go_rice_pars[abs_sum]; uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]); @@ -274,7 +285,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); uint32_t coeff_abs = abs(coeff[blk_pos]); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0); rice_param = g_go_rice_pars[abs_sum]; pos0 = ((quant_state<2)?1:2) << rice_param; uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? 
coeff_abs - 1 : coeff_abs); @@ -291,7 +302,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, uint32_t num_signs = num_non_zero; - if (state->encoder_control->cfg.signhide_enable && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { + if (state->encoder_control->cfg.signhide_enable && !state->encoder_control->cfg.dep_quant && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { num_signs--; coeff_signs >>= 1; } diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index 8cfe497d..0de02e3c 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -44,7 +44,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 35494b99..398388fc 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -34,6 +34,7 @@ #include +#include "cu.h" #include "intra.h" #include "uvg266.h" #include "strategyselector.h" @@ -42,25 +43,32 @@ /** * \brief Generate angular predictions. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. * \param intra_mode Angular mode in range 2..34. + * \param channel_type Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. - * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. + * \param in_ref_left Pointer to -1 index of left reference, length=height*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. */ static void uvg_angular_pred_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode, + const int cu_dim) { + int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; - assert(log2_width >= 2 && log2_width <= 5); - assert(intra_mode >= 2 && intra_mode <= 66); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); + // assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp @@ -105,126 +113,105 @@ static void uvg_angular_pred_generic( // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. 
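Reviewer note on the scratch buffers introduced just below: with rectangular blocks, a horizontal-mode prediction can no longer be transposed in place inside dst, so this hunk renders through a `work` pointer that aliases `dst` for square or vertical cases and points at the `temp_dst` scratch area for rectangular horizontal modes. The final flip, extracted verbatim from the tail of this function for readability (width and height have already been swapped for horizontal modes at this point; square blocks instead transpose in place with SWAP):

    /* Rectangular horizontal mode: `work` holds the prediction with the
     * axes swapped; write it into dst transposed. */
    for (int y = 0; y < width; ++y) {
      for (int x = 0; x < height; ++x) {
        dst[x + y * height] = work[y + x * width];
      }
    }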
+ uvg_pixel temp_dst[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + // TODO: check the correct size for these arrays when MRL is used //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - const int_fast32_t width = 1 << log2_width; + uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uint32_t pred_mode = intra_mode; // ToDo: handle WAIP uint8_t multi_ref_index = multi_ref_idx; + uint8_t isp = isp_mode; // Whether to swap references to always project on the left reference row. const bool vertical_mode = intra_mode >= 34; // Modes distance to horizontal or vertical mode. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18); - //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; // Sample displacement per column in fractions of 32. - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + const int16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - // TODO: replace latter width with height - int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); + const int side_size = vertical_mode ? log2_height : log2_width; + int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); // Pointer for the reference we are interpolating from. uvg_pixel *ref_main; // Pointer for the other reference. const uvg_pixel *ref_side; + uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst; + + const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1; + const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1; // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { + memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); - // TODO: for non-square blocks, separate loops for x and y is needed - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); + ref_main = vertical_mode ? temp_above + height : temp_left + width; + ref_side = vertical_mode ? temp_left + width : temp_above + height; + + int size_side = vertical_mode ? height : width; + for (int i = -size_side; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } - - // TODO: take into account non-square blocks - ref_main = temp_main + width; - ref_side = temp_side + width; - - // TODO: for non square blocks, need to check if width or height is used for reference extension - for (int i = -width; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; - } - - //const uint32_t index_offset = width + 1; - //const int32_t last_index = width; - //const int_fast32_t most_negative_index = (width * sample_disp) >> 5; - //// Negative sample_disp means, we need to use both references. - - //// TODO: update refs to take into account variating block size and shapes - //// (height is not always equal to width) - //ref_side = (vertical_mode ? 
in_ref_left : in_ref_above) + 1; - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - - //// Move the reference pixels to start from the middle to the later half of - //// the tmp_ref, so there is room for negative indices. - //for (int_fast32_t x = -1; x < width; ++x) { - // tmp_ref[x + index_offset] = ref_main[x]; - //} - //// Get a pointer to block index 0 in tmp_ref. - //ref_main = &tmp_ref[index_offset]; - //tmp_ref[index_offset -1] = tmp_ref[index_offset]; - - //// Extend the side reference to the negative indices of main reference. - //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" - //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)]; - //// TODO: add 'vertical_mode ? height : width' instead of 'width' - // - //for (int_fast32_t x = -1; x > most_negative_index; x--) { - // col_sample_disp += inv_abs_sample_disp; - // int_fast32_t side_index = col_sample_disp >> 8; - // tmp_ref[x + index_offset - 1] = ref_side[side_index - 1]; - //} - //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1]; - //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset]; } else { - - // TODO: again, separate loop needed for non-square blocks - for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { - temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - // TODO: this code block will need to change also when non-square blocks are used - // const int log2_ratio = 0; - const int s = 0; + ref_main = vertical_mode ? temp_above : temp_left; + ref_side = vertical_mode ? temp_left : temp_above; + + const int log2_ratio = log2_width - log2_height; + const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); const int max_index = (multi_ref_index << s) + 2; - const int ref_length = width << 1; - const uvg_pixel val = temp_main[ref_length + multi_ref_index]; - for (int j = 1; j <= max_index; j++) { - temp_main[ref_length + multi_ref_index + j] = val; + int ref_length; + if (isp_mode) { + ref_length = vertical_mode ? top_ref_length : left_ref_length; + } + else { + ref_length = vertical_mode ? width << 1 : height << 1; + } + const uvg_pixel val = ref_main[ref_length + multi_ref_index]; + for (int j = 1; j <= max_index; j++) { + ref_main[ref_length + multi_ref_index + j] = val; } - - ref_main = temp_main; - ref_side = temp_side; - //// sample_disp >= 0 means we don't need to refer to negative indices, - //// which means we can just use the references as is. - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - //ref_side = (vertical_mode ? 
in_ref_left : in_ref_above) + 1; - - //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel)); - //ref_main = &tmp_ref[width]; - //tmp_ref[width-1] = tmp_ref[width]; - //int8_t last_index = 1 + width*2; - //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1]; } + // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; + if (!vertical_mode) { SWAP(width, height, int) } if (sample_disp != 0) { + bool use_cubic = true; // Default to cubic filter + static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; + int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); + if (dist_from_vert_or_hor > filter_threshold) { + if ((abs(sample_disp) & 0x1F) != 0) + { + use_cubic = false; + } + } + // Cubic must be used if ref line != 0 or if isp mode is != 0 + if (multi_ref_index || isp) { + use_cubic = true; + } // The mode is not horizontal or vertical, we have to do interpolation. - for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { + int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); + const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; + int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; if ((abs(sample_disp) & 0x1F) != 0) { @@ -232,25 +219,7 @@ static void uvg_angular_pred_generic( if (channel_type == 0) { int32_t ref_main_index = delta_int; uvg_pixel p[4]; - bool use_cubic = true; // Default to cubic filter - static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; - int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); - if (dist_from_vert_or_hor > filter_threshold) { - static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - if ((abs(sample_disp) & 0x1F) != 0) - { - use_cubic = false; - } - } - // Cubic must be used if ref line != 0 - if (multi_ref_index) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; - int16_t const * const f = use_cubic ? 
cubic_filter[delta_fract] : filter_coeff; + // Do 4-tap intra interpolation filtering for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { p[0] = ref_main[ref_main_index]; @@ -258,7 +227,7 @@ static void uvg_angular_pred_generic( p[2] = ref_main[ref_main_index + 2]; p[3] = ref_main[ref_main_index + 3]; - dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); + work[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); } } @@ -268,99 +237,79 @@ static void uvg_angular_pred_generic( for (int_fast32_t x = 0; x < width; ++x) { uvg_pixel ref1 = ref_main[x + delta_int + 1]; uvg_pixel ref2 = ref_main[x + delta_int + 2]; - dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); + work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); } } } else { // Just copy the integer samples for (int_fast32_t x = 0; x < width; x++) { - dst[y * width + x] = ref_main[x + delta_int + 1]; + work[y * width + x] = ref_main[x + delta_int + 1]; } } // PDPC - bool PDPC_filter = (width >= 4 || channel_type != 0); + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0; if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; } else if (mode_disp > 0) { - PDPC_filter = (scale >= 0); + PDPC_filter &= (scale >= 0); } } if(PDPC_filter) { - int inv_angle_sum = 256; + int inv_angle_sum = 256; for (int x = 0; x < MIN(3 << scale, width); x++) { inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; int wL = 32 >> (2 * x >> scale); const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; - dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6); + work[y * width + x] = work[y * width + x] + ((wL * (left - work[y * width + x]) + 32) >> 6); } } - - /* - if (pred_mode == 2 || pred_mode == 66) { - int wT = 16 >> MIN(31, ((y << 1) >> scale)); - for (int x = 0; x < width; x++) { - int wL = 16 >> MIN(31, ((x << 1) >> scale)); - if (wT + wL == 0) break; - int c = x + y + 1; - if (c >= 2 * width) { wL = 0; } - if (c >= 2 * width) { wT = 0; } - const uvg_pixel left = (wL != 0) ? ref_side[c] : 0; - const uvg_pixel top = (wT != 0) ? ref_main[c] : 0; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6); - } - } else if (sample_disp == 0 || sample_disp >= 12) { - int inv_angle_sum_0 = 2; - for (int x = 0; x < width; x++) { - inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)]; - int delta_pos_0 = inv_angle_sum_0 >> 2; - int delta_frac_0 = delta_pos_0 & 63; - int delta_int_0 = delta_pos_0 >> 6; - int delta_y = y + delta_int_0 + 1; - // TODO: convert to JVET_K0500_WAIP - if (delta_y > width + width - 1) break; - - int wL = 32 >> MIN(31, ((x << 1) >> scale)); - if (wL == 0) break; - const uvg_pixel *p = ref_side + delta_y - 1; - uvg_pixel left = p[delta_frac_0 >> 5]; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6); - } - }*/ } } else { // Mode is horizontal or vertical, just copy the pixels. 
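For orientation before the pure horizontal/vertical branch below: that branch derives scale as (log2_width + log2_height - 2) >> 2 and a left-reference blend weight wL = 32 >> ((2 * x) >> scale). A worked example, assuming an 8x8 luma block (the block size is an assumption for illustration only):

    /* 8x8 block: scale = (3 + 3 - 2) >> 2 = 1, so only the first
     * MIN(3 << scale, width) = 6 columns are filtered, and the weight
     * halves per column:
     *   x : 0   1   2   3   4   5    (columns 6..7 untouched)
     *   wL: 32  16  8   4   2   1
     * Each filtered sample p then becomes
     *   p + ((wL * (left - top_left) + 32) >> 6). */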
+ + // Do not apply PDPC if multi ref line index is other than 0 + // TODO: do not do PDPC if block is in BDPCM mode + bool do_pdpc = ((width >= 4 && height >= 4) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); - // TODO: update outer loop to use height instead of width - for (int_fast32_t y = 0; y < width; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; - } - // Do not apply PDPC if multi ref line index is other than 0 - if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_width - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; + if (do_pdpc) { + int scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) { + const int wL = 32 >> (2 * x >> scale); + const uvg_pixel val = work[y * width + x]; + work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); } } + } else { + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + } } } // Flip the block if this is was a horizontal mode. if (!vertical_mode) { - for (int_fast32_t y = 0; y < width - 1; ++y) { - for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel); + if(width == height) { + for (int_fast32_t y = 0; y < height - 1; ++y) { + for (int_fast32_t x = y + 1; x < width; ++x) { + SWAP(work[y * height + x], work[x * width + y], uvg_pixel); + } + } + } else { + for(int y = 0; y < width; ++y) { + for(int x = 0; x < height; ++x) { + dst[x + y * height] = work[y + x * width]; + } } } } @@ -369,23 +318,32 @@ static void uvg_angular_pred_generic( /** * \brief Generate planar prediction. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. */ static void uvg_intra_pred_planar_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst) { - // TODO: Add height - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + const int offset = 1 << (log2_width + log2_height); + const int final_shift = 1 + log2_width + log2_height; + + // If ISP is enabled log_dim 1 is possible (limit was previously 2) + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); - const int_fast8_t width = 1 << log2_width; const uvg_pixel top_right = ref_top[width + 1]; - const uvg_pixel bottom_left = ref_left[width + 1]; + const uvg_pixel bottom_left = ref_left[height + 1]; #if 0 // Unoptimized version for reference. 
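The hunk that follows generalizes planar prediction to rectangular blocks. As a condensed restatement of what its incremental loop computes (T/L are the above/left reference samples, TR/BL the top-right and bottom-left corner samples; this mirrors the VVC planar definition visible in the code, not new behavior):

    /* ver(x, y)  = (H - 1 - y) * T[x] + (y + 1) * BL
     * hor(x, y)  = (W - 1 - x) * L[y] + (x + 1) * TR
     * pred(x, y) = (ver * W + hor * H + W * H) >> (log2_width + log2_height + 1)
     *
     * The patch keeps these terms as running sums: top[x] gains
     * bottom[x] = BL - T[x] once per row and hor gains
     * right[y] = TR - L[y] once per column, so the inner loop needs only
     * adds and shifts, no per-pixel multiplies. */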
@@ -397,18 +355,27 @@ static void uvg_intra_pred_planar_generic( } } #else - int_fast16_t top[32]; + // TODO: get rid of magic numbers. Make a define for this + int_fast16_t top[64]; + int_fast16_t bottom[64]; + int_fast16_t left[64]; + int_fast16_t right[64]; for (int i = 0; i < width; ++i) { - top[i] = ref_top[i + 1] << log2_width; + bottom[i] = bottom_left - ref_top[i + 1]; + top[i] = ref_top[i + 1] << log2_height; } - for (int y = 0; y < width; ++y) { - int_fast16_t hor = (ref_left[y + 1] << log2_width) + width; + for (int j = 0; j < height; ++j) { + right[j] = top_right - ref_left[j + 1]; + left[j] = ref_left[j + 1] << log2_width; + } + + for (int y = 0; y < height; ++y) { + int_fast16_t hor = left[y]; for (int x = 0; x < width; ++x) { - hor += top_right - ref_left[y + 1]; - top[x] += bottom_left - ref_top[x + 1]; - dst[y * width + x] = (hor + top[x]) >> (log2_width + 1); - // + hor += right[y]; + top[x] += bottom[x]; + dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift; } } #endif @@ -461,25 +428,26 @@ static void uvg_intra_pred_filtered_dc_generic( /** * \brief Position Dependent Prediction Combination for Planar and DC modes. -* \param log2_width Log2 of width, range 2..5. -* \param width Block width matching log2_width. +* \param cu_loc CU location and size data. * \param used_ref Pointer used reference pixel struct. * \param dst Buffer of size width*width. */ static void uvg_pdpc_planar_dc_generic( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; - // TODO: replace latter log2_width with log2_height - const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2); + const int scale = (log2_width + log2_height - 2) >> 2; - // TODO: replace width with height - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { int wT = 32 >> MIN(31, ((y << 1) >> scale)); for (int x = 0; x < width; x++) { int wL = 32 >> MIN(31, ((x << 1) >> scale)); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 817befed..5e06ebbe 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -32,6 +32,7 @@ #include "strategies/generic/picture-generic.h" +#include #include #include "strategies/strategies-picture.h" @@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel) SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) +static uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + uint64_t satd = 0; + coeff_t diff[4], m[4]; + + diff[0] = piOrg[0] - piCur[0]; + diff[1] = piOrg[1] - piCur[1]; + diff[2] = piOrg[iStrideOrg] - piCur[0 + iStrideCur]; + diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur]; + m[0] = diff[0] + diff[2]; + m[1] = diff[1] + diff[3]; + m[2] = diff[0] - diff[2]; + m[3] = diff[1] - diff[3]; + + satd += abs(m[0] + m[1]) >> 2; + satd += abs(m[0] - m[1]); + satd += abs(m[2] + m[3]); + satd += abs(m[2] - m[3]); + + return satd; +} + + +static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ //need to add SIMD implementation ,JCA + int k, i, j, jj, sad = 0; + int diff[128], m1[8][16], m2[8][16]; + for (k = 0; k < 128; k += 16) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + diff[k + 8] = piOrg[8] - piCur[8]; + diff[k + 9] = piOrg[9] - piCur[9]; + diff[k + 10] = piOrg[10] - piCur[10]; + diff[k + 11] = piOrg[11] - piCur[11]; + diff[k + 12] = piOrg[12] - piCur[12]; + diff[k + 13] = piOrg[13] - piCur[13]; + diff[k + 14] = piOrg[14] - piCur[14]; + diff[k + 15] = piOrg[15] - piCur[15]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 4; + + m2[j][0] = diff[jj] + diff[jj + 8]; + m2[j][1] = diff[jj + 1] + diff[jj + 9]; + m2[j][2] = diff[jj + 2] + diff[jj + 10]; + m2[j][3] = diff[jj + 3] + diff[jj + 11]; + m2[j][4] = diff[jj + 4] + diff[jj + 12]; + m2[j][5] = diff[jj + 5] + diff[jj + 13]; + m2[j][6] = diff[jj + 6] + diff[jj + 14]; + m2[j][7] = diff[jj + 7] + diff[jj + 15]; + m2[j][8] = diff[jj] - diff[jj + 8]; + m2[j][9] = diff[jj + 1] - diff[jj + 9]; + m2[j][10] = diff[jj + 2] - diff[jj + 10]; + m2[j][11] = diff[jj + 3] - diff[jj + 11]; + m2[j][12] = diff[jj + 4] - diff[jj + 12]; + m2[j][13] = diff[jj + 5] - diff[jj + 13]; + m2[j][14] = diff[jj + 6] - diff[jj + 14]; + m2[j][15] = diff[jj + 7] - diff[jj + 15]; + + m1[j][0] = m2[j][0] + m2[j][4]; + m1[j][1] = m2[j][1] + m2[j][5]; + m1[j][2] = m2[j][2] + m2[j][6]; + m1[j][3] = m2[j][3] + m2[j][7]; + m1[j][4] = m2[j][0] - m2[j][4]; + m1[j][5] = m2[j][1] - m2[j][5]; + m1[j][6] = m2[j][2] - m2[j][6]; + m1[j][7] = m2[j][3] - m2[j][7]; + 
m1[j][8] = m2[j][8] + m2[j][12]; + m1[j][9] = m2[j][9] + m2[j][13]; + m1[j][10] = m2[j][10] + m2[j][14]; + m1[j][11] = m2[j][11] + m2[j][15]; + m1[j][12] = m2[j][8] - m2[j][12]; + m1[j][13] = m2[j][9] - m2[j][13]; + m1[j][14] = m2[j][10] - m2[j][14]; + m1[j][15] = m2[j][11] - m2[j][15]; + + m2[j][0] = m1[j][0] + m1[j][2]; + m2[j][1] = m1[j][1] + m1[j][3]; + m2[j][2] = m1[j][0] - m1[j][2]; + m2[j][3] = m1[j][1] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][6]; + m2[j][5] = m1[j][5] + m1[j][7]; + m2[j][6] = m1[j][4] - m1[j][6]; + m2[j][7] = m1[j][5] - m1[j][7]; + m2[j][8] = m1[j][8] + m1[j][10]; + m2[j][9] = m1[j][9] + m1[j][11]; + m2[j][10] = m1[j][8] - m1[j][10]; + m2[j][11] = m1[j][9] - m1[j][11]; + m2[j][12] = m1[j][12] + m1[j][14]; + m2[j][13] = m1[j][13] + m1[j][15]; + m2[j][14] = m1[j][12] - m1[j][14]; + m2[j][15] = m1[j][13] - m1[j][15]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][5]; + m1[j][5] = m2[j][4] - m2[j][5]; + m1[j][6] = m2[j][6] + m2[j][7]; + m1[j][7] = m2[j][6] - m2[j][7]; + m1[j][8] = m2[j][8] + m2[j][9]; + m1[j][9] = m2[j][8] - m2[j][9]; + m1[j][10] = m2[j][10] + m2[j][11]; + m1[j][11] = m2[j][10] - m2[j][11]; + m1[j][12] = m2[j][12] + m2[j][13]; + m1[j][13] = m2[j][12] - m2[j][13]; + m1[j][14] = m2[j][14] + m2[j][15]; + m1[j][15] = m2[j][14] - m2[j][15]; + } + + //vertical + for (i = 0; i < 16; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 16; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[128], m1[16][8], m2[16][8]; + for (k = 0; k < 128; k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 16; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - 
m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[8][i]; + m1[1][i] = m2[1][i] + m2[9][i]; + m1[2][i] = m2[2][i] + m2[10][i]; + m1[3][i] = m2[3][i] + m2[11][i]; + m1[4][i] = m2[4][i] + m2[12][i]; + m1[5][i] = m2[5][i] + m2[13][i]; + m1[6][i] = m2[6][i] + m2[14][i]; + m1[7][i] = m2[7][i] + m2[15][i]; + m1[8][i] = m2[0][i] - m2[8][i]; + m1[9][i] = m2[1][i] - m2[9][i]; + m1[10][i] = m2[2][i] - m2[10][i]; + m1[11][i] = m2[3][i] - m2[11][i]; + m1[12][i] = m2[4][i] - m2[12][i]; + m1[13][i] = m2[5][i] - m2[13][i]; + m1[14][i] = m2[6][i] - m2[14][i]; + m1[15][i] = m2[7][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[12][i]; + m2[9][i] = m1[9][i] + m1[13][i]; + m2[10][i] = m1[10][i] + m1[14][i]; + m2[11][i] = m1[11][i] + m1[15][i]; + m2[12][i] = m1[8][i] - m1[12][i]; + m2[13][i] = m1[9][i] - m1[13][i]; + m2[14][i] = m1[10][i] - m1[14][i]; + m2[15][i] = m1[11][i] - m1[15][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + m1[8][i] = m2[8][i] + m2[10][i]; + m1[9][i] = m2[9][i] + m2[11][i]; + m1[10][i] = m2[8][i] - m2[10][i]; + m1[11][i] = m2[9][i] - m2[11][i]; + m1[12][i] = m2[12][i] + m2[14][i]; + m1[13][i] = m2[13][i] + m2[15][i]; + m1[14][i] = m2[12][i] - m2[14][i]; + m1[15][i] = m2[13][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[9][i]; + m2[9][i] = m1[8][i] - m1[9][i]; + m2[10][i] = m1[10][i] + m1[11][i]; + m2[11][i] = m1[10][i] - m1[11][i]; + m2[12][i] = m1[12][i] + m1[13][i]; + m2[13][i] = m1[12][i] - m1[13][i]; + m2[14][i] = m1[14][i] + m1[15][i]; + m2[15][i] = m1[14][i] - m1[15][i]; + } + + for (i = 0; i < 16; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[8][4], m2[8][4]; + for (k = 0; k < 32; k += 4) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 2; + m2[j][0] = diff[jj] + diff[jj + 2]; + m2[j][1] = diff[jj + 1] + diff[jj + 3]; + m2[j][2] = diff[jj] - 
diff[jj + 2]; + m2[j][3] = diff[jj + 1] - diff[jj + 3]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + } + + //vertical + for (i = 0; i < 4; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 4; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[4][8], m2[4][8]; + for (k = 0; k < 32; k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 4; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + } + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + + +static uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride) +{ + const uvg_pixel* piOrg = ref_in; + const uvg_pixel* piCur = pred_in; + const int iRows = height; + 
const int iCols = width; + const int iStrideOrg = ref_stride; + const int iStrideCur = pred_stride; + + int x = 0, y = 0; + + uint64_t uiSum = 0; + + if (iCols > iRows && (iRows & 7) == 0 && (iCols & 15) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 16) + { + uiSum += xCalcHADs16x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if (iCols < iRows && (iCols & 7) == 0 && (iRows & 15) == 0) + { + for (y = 0; y < iRows; y += 16) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 16; + piCur += iStrideCur * 16; + } + } + else if (iCols > iRows && (iRows & 3) == 0 && (iCols & 7) == 0) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x4(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 4; + piCur += iStrideCur * 4; + } + } + else if (iCols < iRows && (iCols & 3) == 0 && (iRows & 7) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += xCalcHADs4x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if ((iRows % 8 == 0) && (iCols % 8 == 0)) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += satd_8x8_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 8 * iStrideOrg; + piCur += 8 * iStrideCur; + } + } + else if ((iRows % 4 == 0) && (iCols % 4 == 0)) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += uvg_satd_4x4_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 4 * iStrideOrg; + piCur += 4 * iStrideCur; + } + } + else if ((iRows % 2 == 0) && (iCols % 2 == 0)) + { + for (y = 0; y < iRows; y += 2) + { + for (x = 0; x < iCols; x += 2) + { + uiSum += xCalcHADs2x2(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += 2 * iStrideOrg; + piCur += 2 * iStrideCur; + } + } + + // TODO: 10 bit + return (uiSum >> 0); +} + +
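Aside: xGetHADs above mirrors the VTM rate-distortion helper of the same name. It tiles the block with the largest Hadamard kernel whose dimensions divide the block, preferring the rectangular 16x8/8x16 and 8x4/4x8 kernels when width and height differ, then falling back to the square 8x8, 4x4 and 2x2 kernels; the rectangular kernels down-weight the DC term and rescale their transform sum by 2/sqrt(rows*cols) so costs stay comparable across kernel shapes. Below is a minimal caller sketch, assuming only that uvg_strategyselector_init() has filled in the satd_any_size_vtm pointer (registered further below) and that the strategy keeps xGetHADs' (width, height, ref, ref_stride, pred, pred_stride) argument order; the wrapper name is hypothetical:

/* Hypothetical wrapper: Hadamard SATD of one rectangular block through
 * the strategy pointer registered as "satd_any_size_vtm". */
static unsigned example_had_cost(int width, int height,
                                 const uvg_pixel *orig, int orig_stride,
                                 const uvg_pixel *pred, int pred_stride)
{
  /* E.g. a 32x8 block takes the first branch above and is covered by two
   * xCalcHADs16x8 tiles; an 8x8 block falls through to the plain 8x8 SATD. */
  return uvg_satd_any_size_vtm(width, height, orig, orig_stride, pred, pred_stride);
}

// Function macro for defining SAD calculating functions // for fixed size blocks.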
#define SAD_NXN(n, pixel_type) \ @@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel) static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { int ssd = 0; int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride]; ssd += diff * diff; @@ -783,10 +1355,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len) static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, - int width, int ref_stride, int pred_stride) + int width, int height, int ref_stride, int pred_stride) { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]); } @@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic); + success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs); success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 96d2567a..e39b6c52 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -44,7 +44,6 @@ #include "fast_coeff_cost.h" #include "reshape.h" -#define QUANT_SHIFT 14 /** * \brief quantize transformed coefficents * @@ -62,22 +61,28 @@ void uvg_quant_generic( uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; - const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 
0 : transform_shift); + const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift ); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; + if(lfnst_idx == 0){ for (int32_t n = 0; n < width * height; n++) { int32_t level = coef[n]; @@ -86,7 +91,7 @@ void uvg_quant_generic( sign = (level < 0 ? -1 : 1); - int32_t curr_quant_coeff = quant_coeff[n]; + int32_t curr_quant_coeff = use_scaling_list ? quant_coeff[n] : default_quant_coeff; level = (int32_t)((abs_level * curr_quant_coeff + add) >> q_bits); ac_sum += level; @@ -237,6 +242,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -247,28 +253,28 @@ int uvg_quant_cbcr_residual_generic( uvg_pixel* v_rec_out, coeff_t* coeff_out, bool early_skip, - int lmcs_chroma_adj, enum uvg_tree_type tree_type - ) { + int lmcs_chroma_adj, enum uvg_tree_type tree_type) +{ ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - + // TODO: this function is not fully converted to handle non-square blocks { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]); v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]); } } } - uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride); - uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride); + uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride); + uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride); const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1); - for (int y = 0; y < width; y++) + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -305,33 +311,44 @@ int uvg_quant_cbcr_residual_generic( } - uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); - if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uint8_t lfnst_idx = tree_type == UVG_CHROMA_T ? 
cur_cu->cr_lfnst_idx : cur_cu->lfnst_idx; + if(lfnst_idx) { + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - cur_cu->cr_lfnst_idx); + uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx, 0); } else if (state->encoder_control->cfg.rdoq_enable && false) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx); + uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, lfnst_idx); } int8_t has_coeffs = 0; { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; @@ -342,13 +359,13 @@ int uvg_quant_cbcr_residual_generic( if (has_coeffs && !early_skip) { // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - if (cur_cu->cr_lfnst_idx) { - uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + if (lfnst_idx) { + uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -371,7 +388,7 @@ int uvg_quant_cbcr_residual_generic( //} const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. 
(residual + pred_in -> rec_out) - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { if (temp == 2) { u_residual[x + y * width] = combined_residual[x + y * width]; @@ -400,7 +417,7 @@ int uvg_quant_cbcr_residual_generic( } } } - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride]; u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val); @@ -413,7 +430,7 @@ int uvg_quant_cbcr_residual_generic( // With no coeffs and rec_out == pred_int we skip copying the coefficients // because the reconstruction is just the prediction. - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride]; v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride]; @@ -441,7 +458,7 @@ int uvg_quant_cbcr_residual_generic( * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -454,19 +471,19 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int has_coeffs = 0; - assert(width <= TR_MAX_WIDTH); - assert(width >= TR_MIN_WIDTH); - - const int height = width; // TODO: height for non-square blocks + // With ISP these checks no longer apply, since width and height 2 is now possible + // With MTT even 1x16 and 16x1 ISP splits are possible + //assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); + //assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { sign = residual[x + y * width] >= 0 ? 1 : -1; absval = sign * residual[x + y * width]; @@ -477,43 +494,54 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } - const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; + const uint8_t lfnst_index = tree_type != UVG_CHROMA_T || color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } // Quantize coeffs. 
(coeff -> coeff_out) - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - lfnst_index); + uvg_rdoq(state, coeff, coeff_out, width, height, color, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } // Check if there are any non-zero coefficients. { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; @@ -527,25 +555,25 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int y, x; // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, color, + uvg_dequant(state, coeff_out, coeff, width, height, color, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y); if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Inverse low frequency non-separable transform - uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } if (use_trskip) { - uvg_itransformskip(state->encoder_control, residual, coeff, width); + uvg_itransformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]); sign = residual[x + y * width] >= 0 ? 1 : -1; @@ -561,7 +589,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, } // Get quantized reconstruction. (residual + pred_in -> rec_out) - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); @@ -573,7 +601,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // because the reconstruction is just the prediction. 
int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { rec_out[x + y * out_stride] = pred_in[x + y * in_stride]; } @@ -590,23 +618,29 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if(encoder->cfg.dep_quant && !transform_skip) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift); + shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); if (encoder->scaling_list.enable) { - uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2; - uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)(color); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6]; + const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; shift += 4; if (shift >qp_scaled / 6) { @@ -624,10 +658,10 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); - for (n = 0; n < width*height; n++) { + for (n = 0; n < width * height; n++) { coeff_q = (q_coef[n] * scale + add) >> shift; coef[n] = (coeff_t)CLIP(-32768, 32767, coeff_q); } @@ -651,14 +685,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights) weights[3] = (wts_packed >> 48) & 0xffff; } -static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); uint32_t sum = 0; uint16_t weights_unpacked[4]; get_coeff_weights(weights, weights_unpacked); - for (int32_t i = 0; i < width * width; i++) { + for (int32_t i = 0; i < width * height; i++) { int16_t curr = coeff[i]; uint32_t curr_abs = abs(curr); if (curr_abs > 3) { diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h index da2b05ae..665e0863 100644 --- a/src/strategies/generic/quant-generic.h +++ b/src/strategies/generic/quant-generic.h @@ -44,8 +44,6 @@ #include "uvg266.h" #include "tables.h" -#define QUANT_SHIFT 14 - int uvg_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); void uvg_quant_generic( const encoder_state_t * const state, @@ -60,7 +58,7 @@ void uvg_quant_generic( uint8_t lfnst_idx); int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -71,6 +69,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 4ba2a37b..64b72eb9 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0; dct_func * uvg_dct_8x8 = 0; dct_func * uvg_dct_16x16 = 0; dct_func * uvg_dct_32x32 = 0; +dct_func * uvg_dct_non_square = 0; dct_func * uvg_fast_inverse_dst_4x4 = 0; @@ -56,16 +57,19 @@ void(*uvg_mts_dct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); + void(*uvg_mts_idct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); int uvg_strategy_register_dct(void* opaque, uint8_t 
bitdepth) { @@ -90,8 +94,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) { * * \returns Pointer to the function. */ -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. Return generic dct for non-square blocks. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function."); + //return uvg_dct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { @@ -119,8 +128,13 @@ dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) * * \returns Pointer to the function. */ -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. Return generic dct for non-square blocks. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_idct function."); + //return uvg_idct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index d58bf5a9..0ad3c8c4 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4; extern dct_func * uvg_dct_8x8; extern dct_func * uvg_dct_16x16; extern dct_func * uvg_dct_32x32; +extern dct_func * uvg_dct_non_square; extern dct_func * uvg_fast_inverse_dst_4x4; @@ -64,9 +65,10 @@ typedef void (mts_dct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_dct_func* uvg_mts_dct; @@ -75,15 +77,16 @@ typedef void (mts_idct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_idct_func* uvg_mts_idct; int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type); -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type); +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type); +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type); diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c new file mode 100644 index 00000000..d0eac087 --- /dev/null +++ b/src/strategies/strategies-depquant.c @@ -0,0 +1,55 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution.
+ * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#include "strategies/strategies-depquant.h" + +#include "strategies/avx2/depquant-avx2.h" +#include "strategies/generic/depquant-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; + + +int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategy_register_depquant_generic(opaque, bitdepth); + + if (uvg_g_hardware_flags.intel_flags.avx2) { + success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth); + } + return success; +} diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h new file mode 100644 index 00000000..5a58a3c7 --- /dev/null +++ b/src/strategies/strategies-depquant.h @@ -0,0 +1,88 @@ +#ifndef STRATEGIES_DEPQUANT_H_ +#define STRATEGIES_DEPQUANT_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for dependent quantization (dep quant) functions. + */ + +#include "encoder.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "dep_quant.h" + + +// Declare function pointer types. +typedef int(dep_quant_decide_and_update_func)( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma); + +typedef void (find_first_non_zero_coeff_func)( + const coeff_t* srcCoeff, + const bool enableScalingLists, + const context_store* const dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + int width, + int height); + + +// Declare function pointers. +extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; + +int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_DEPQUANT_EXPORTS \ + {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \ + {"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \ + + + +#endif //STRATEGIES_DEPQUANT_H_ diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index 8743a6ed..969dfb57 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -49,7 +49,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 0f7228a0..52f5e519 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -38,22 +38,26 @@ * Interface for intra prediction functions.
*/ +#include "cu.h" #include "global.h" // IWYU pragma: keep #include "intra.h" #include "uvg266.h" typedef void (angular_pred_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx); + const uint8_t multi_ref_idx, + const uint8_t isp_mode, + const int cu_dim); typedef void (intra_pred_planar_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst); @@ -67,8 +71,8 @@ typedef void (intra_pred_filtered_dc_func)( typedef void (pdpc_planar_dc_func)( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst); diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 00ad9ccb..649af2d6 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -37,6 +37,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" +#include "strategies/sse42/picture-sse42.h" #include "strategyselector.h" @@ -70,6 +71,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0; cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0; cost_pixel_any_size_func * uvg_satd_any_size = 0; +cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0; cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0; pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0; @@ -115,103 +117,116 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) { /** * \brief Get a function that calculates SATD for NxN block. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. * * \returns Pointer to cost_16bit_nxn_func. */ -cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4; - case 8: - return uvg_satd_8x8; - case 16: - return uvg_satd_16x16; - case 32: - return uvg_satd_32x32; - case 64: - return uvg_satd_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4; + case 8: + return uvg_satd_8x8; + case 16: + return uvg_satd_16x16; + case 32: + return uvg_satd_32x32; + case 64: + return uvg_satd_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SAD for NxN block. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_16bit_nxn_func. 
*/ -cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4; - case 8: - return uvg_sad_8x8; - case 16: - return uvg_sad_16x16; - case 32: - return uvg_sad_32x32; - case 64: - return uvg_sad_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4; + case 8: + return uvg_sad_8x8; + case 16: + return uvg_sad_16x16; + case 32: + return uvg_sad_32x32; + case 64: + return uvg_sad_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SATDs for 2 NxN blocks. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. +* \param height Height of the region for which SATD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. */ -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4_dual; - case 8: - return uvg_satd_8x8_dual; - case 16: - return uvg_satd_16x16_dual; - case 32: - return uvg_satd_32x32_dual; - case 64: - return uvg_satd_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4_dual; + case 8: + return uvg_satd_8x8_dual; + case 16: + return uvg_satd_16x16_dual; + case 32: + return uvg_satd_32x32_dual; + case 64: + return uvg_satd_64x64_dual; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SADs for 2 NxN blocks. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. 
*/ -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4_dual; - case 8: - return uvg_sad_8x8_dual; - case 16: - return uvg_sad_16x16_dual; - case 32: - return uvg_sad_32x32_dual; - case 64: - return uvg_sad_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4_dual; + case 8: + return uvg_sad_8x8_dual; + case 16: + return uvg_sad_16x16_dual; + case 32: + return uvg_sad_32x32_dual; + case 64: + return uvg_sad_64x64_dual; + default: + return NULL; + } } + return NULL; } // Precomputed CRC32C lookup table for polynomial 0x04C11DB7 diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 88f52cfc..cd4e2ec5 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)( typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out); typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); -typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height); typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data, int32_t block_width, int32_t block_height, @@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu, typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len); -typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride); +typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride); extern const uint32_t uvg_crc_table[256]; @@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16; extern cost_pixel_nxn_func * uvg_satd_32x32; extern cost_pixel_nxn_func * uvg_satd_64x64; extern cost_pixel_any_size_func *uvg_satd_any_size; +extern cost_pixel_any_size_func *uvg_satd_any_size_vtm; extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual; extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual; @@ -203,8 +204,8 @@ extern pixel_var_func *uvg_pixel_var; extern generate_residual_func* uvg_generate_residual; int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth); -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n); -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height); +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height); #define STRATEGIES_PICTURE_EXPORTS \ {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \ @@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); {"satd_32x32", (void**) &uvg_satd_32x32}, \ {"satd_64x64", (void**) &uvg_satd_64x64}, \ 
{"satd_any_size", (void**) &uvg_satd_any_size}, \ + {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \ {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \ {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \ {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \ diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c index 89baf86e..62c75d6f 100644 --- a/src/strategies/strategies-quant.c +++ b/src/strategies/strategies-quant.c @@ -38,15 +38,16 @@ // Define function pointers. -quant_func *uvg_quant; -quant_cbcr_func *uvg_quant_cbcr_residual; -quant_residual_func *uvg_quantize_residual; -dequant_func *uvg_dequant; -coeff_abs_sum_func *uvg_coeff_abs_sum; +quant_func *uvg_quant; +quant_cbcr_func *uvg_quant_cbcr_residual; +quant_residual_func *uvg_quantize_residual; +dequant_func *uvg_dequant; +coeff_abs_sum_func *uvg_coeff_abs_sum; fast_coeff_cost_func *uvg_fast_coeff_cost; -int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) { +int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth) +{ bool success = true; success &= uvg_strategy_register_quant_generic(opaque, bitdepth); diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h index a6c9a3d4..b0e75046 100644 --- a/src/strategies/strategies-quant.h +++ b/src/strategies/strategies-quant.h @@ -45,12 +45,23 @@ #include "tables.h" // Declare function pointers. -typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx); +typedef unsigned (quant_func)( + const encoder_state_t * const state, + coeff_t *coef, + coeff_t *q_coef, + int32_t width, + int32_t height, + color_t color, + int8_t scan_idx, + int8_t block_type, + int8_t transform_skip, + uint8_t lfnst_idx); + typedef unsigned (quant_cbcr_func)( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -63,16 +74,19 @@ typedef unsigned (quant_cbcr_func)( bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (quant_residual_func)(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, uvg_pixel *rec_out, coeff_t *coeff_out, bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, color_t color, int8_t block_type, int8_t transform_skip); -typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights); + +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); diff --git a/src/strategyselector.c b/src/strategyselector.c index 477604a9..d6dffa4e 100644 --- a/src/strategyselector.c +++ b/src/strategyselector.c @@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { fprintf(stderr, "uvg_strategy_register_encode failed!\n"); return 0; } + if 
(!uvg_strategy_register_depquant(&strategies, bitdepth)) { + fprintf(stderr, "uvg_strategy_register_depquant failed!\n"); + return 0; + } while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); diff --git a/src/strategyselector.h b/src/strategyselector.h index caadfda9..8bbdfbed 100644 --- a/src/strategyselector.h +++ b/src/strategyselector.h @@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st #include "strategies/strategies-intra.h" #include "strategies/strategies-sao.h" #include "strategies/strategies-encode.h" +#include "strategies/strategies-depquant.h" #include "strategies/strategies-alf.h" static const strategy_to_select_t strategies_to_select[] = { @@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_SAO_EXPORTS STRATEGIES_ENCODE_EXPORTS STRATEGIES_ALF_EXPORTS + STRATEGIES_DEPQUANT_EXPORTS { NULL, NULL }, }; diff --git a/src/tables.c b/src/tables.c index 422fd714..c98ecf79 100644 --- a/src/tables.c +++ b/src/tables.c @@ -7,6 +7,8 @@ #endif // 4 8 16 32 64 const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1] = {-1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4}; +// 0 1 2 4 8 16 32 64 +const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2] = //===== luma/chroma ===== @@ -82,3 +84,2543 @@ const uint32_t* const uvg_g_sig_last_scan[3][5] = { {g_sig_last_scan_1_0, g_sig_last_scan_1_1, g_sig_last_scan_1_2, g_sig_last_scan_1_3, g_sig_last_scan_1_4}, {g_sig_last_scan_2_0, g_sig_last_scan_2_1, g_sig_last_scan_2_2, g_sig_last_scan_2_3, g_sig_last_scan_2_4} }; + +// Holds scan order indices for all possible block sizes for diagonal scan order and coefficient group scan order +static const uint32_t const g_scan_order_buffer[32258] = { + 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, // UNGROUPED 1xN, 1x2, 1x4, 1x8 + 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // 1x16 + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 1x32 + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, // 1x64 + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, // 2xN, 2x2 + 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, // 2x4, 2x8 + 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, + 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, // 2x16 + 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, + 26, 25, 28, 27, 30, 29, 31, 0, 2, 1, 4, 3, 6, // 2x32 + 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, + 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, + 31, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, + 46, 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, + 57, 60, 59, 62, 61, 63, 0, 2, 1, 4, 3, 6, 5, // 2x64 + 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, + 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, 31, + 34, 33, 
36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, + 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, + 60, 59, 62, 61, 64, 63, 66, 65, 68, 67, 70, 69, 72, + 71, 74, 73, 76, 75, 78, 77, 80, 79, 82, 81, 84, 83, + 86, 85, 88, 87, 90, 89, 92, 91, 94, 93, 96, 95, 98, + 97, 100, 99, 102, 101, 104, 103, 106, 105, 108, 107, 110, 109, + 112, 111, 114, 113, 116, 115, 118, 117, 120, 119, 122, 121, 124, + 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, // 4xN, 4x2 + 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, // 4x4 + 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, // 4x8 + 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, + 18, 15, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 0, // 4x16 + 4, 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, + 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, + 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, + 38, 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, + 47, 60, 57, 54, 51, 61, 58, 55, 62, 59, 63, 0, 4, // 4x32 + 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, + 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, + 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, + 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, + 60, 57, 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, + 69, 66, 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, + 78, 75, 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, + 87, 100, 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, + 112, 109, 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, + 121, 118, 115, 125, 122, 119, 126, 123, 127, 0, 4, 1, 8, // 4x64 + 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, + 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, + 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, 35, 48, + 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, 60, 57, + 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, 69, 66, + 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, 78, 75, + 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, 87, 100, + 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, 112, 109, + 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, 121, 118, + 115, 128, 125, 122, 119, 132, 129, 126, 123, 136, 133, 130, 127, + 140, 137, 134, 131, 144, 141, 138, 135, 148, 145, 142, 139, 152, + 149, 146, 143, 156, 153, 150, 147, 160, 157, 154, 151, 164, 161, + 158, 155, 168, 165, 162, 159, 172, 169, 166, 163, 176, 173, 170, + 167, 180, 177, 174, 171, 184, 181, 178, 175, 188, 185, 182, 179, + 192, 189, 186, 183, 196, 193, 190, 187, 200, 197, 194, 191, 204, + 201, 198, 195, 208, 205, 202, 199, 212, 209, 206, 203, 216, 213, + 210, 207, 220, 217, 214, 211, 224, 221, 218, 215, 228, 225, 222, + 219, 232, 229, 226, 223, 236, 233, 230, 227, 240, 237, 234, 231, + 244, 241, 238, 235, 248, 245, 242, 239, 252, 249, 246, 243, 253, + 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, // 8xN + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, // 8x2 + 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, // 8x4 + 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, + 21, 14, 7, 29, 22, 15, 30, 23, 31, 0, 8, 1, 16, // 8x8 + 9, 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, + 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, + 42, 35, 28, 21, 14, 7, 57, 50, 43, 36, 29, 22, 15, + 58, 51, 44, 37, 30, 23, 59, 52, 45, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, 0, 8, 1, 16, 9, // 8x16 + 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, + 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, + 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, + 72, 65, 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, + 45, 38, 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, + 82, 75, 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, + 55, 112, 105, 98, 91, 84, 
77, 70, 63, 120, 113, 106, 99, + 92, 85, 78, 71, 121, 114, 107, 100, 93, 86, 79, 122, 115, + 108, 101, 94, 87, 123, 116, 109, 102, 95, 124, 117, 110, 103, + 125, 118, 111, 126, 119, 127, 0, 8, 1, 16, 9, 2, 24, // 8x32 + 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, + 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, + 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, 72, 65, + 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, 45, 38, + 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, 82, 75, + 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, 55, 112, + 105, 98, 91, 84, 77, 70, 63, 120, 113, 106, 99, 92, 85, + 78, 71, 128, 121, 114, 107, 100, 93, 86, 79, 136, 129, 122, + 115, 108, 101, 94, 87, 144, 137, 130, 123, 116, 109, 102, 95, + 152, 145, 138, 131, 124, 117, 110, 103, 160, 153, 146, 139, 132, + 125, 118, 111, 168, 161, 154, 147, 140, 133, 126, 119, 176, 169, + 162, 155, 148, 141, 134, 127, 184, 177, 170, 163, 156, 149, 142, + 135, 192, 185, 178, 171, 164, 157, 150, 143, 200, 193, 186, 179, + 172, 165, 158, 151, 208, 201, 194, 187, 180, 173, 166, 159, 216, + 209, 202, 195, 188, 181, 174, 167, 224, 217, 210, 203, 196, 189, + 182, 175, 232, 225, 218, 211, 204, 197, 190, 183, 240, 233, 226, + 219, 212, 205, 198, 191, 248, 241, 234, 227, 220, 213, 206, 199, + 249, 242, 235, 228, 221, 214, 207, 250, 243, 236, 229, 222, 215, + 251, 244, 237, 230, 223, 252, 245, 238, 231, 253, 246, 239, 254, + 247, 255, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 32, // 8x64 + 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, + 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, + 57, 50, 43, 36, 29, 22, 15, 72, 65, 58, 51, 44, 37, + 30, 23, 80, 73, 66, 59, 52, 45, 38, 31, 88, 81, 74, + 67, 60, 53, 46, 39, 96, 89, 82, 75, 68, 61, 54, 47, + 104, 97, 90, 83, 76, 69, 62, 55, 112, 105, 98, 91, 84, + 77, 70, 63, 120, 113, 106, 99, 92, 85, 78, 71, 128, 121, + 114, 107, 100, 93, 86, 79, 136, 129, 122, 115, 108, 101, 94, + 87, 144, 137, 130, 123, 116, 109, 102, 95, 152, 145, 138, 131, + 124, 117, 110, 103, 160, 153, 146, 139, 132, 125, 118, 111, 168, + 161, 154, 147, 140, 133, 126, 119, 176, 169, 162, 155, 148, 141, + 134, 127, 184, 177, 170, 163, 156, 149, 142, 135, 192, 185, 178, + 171, 164, 157, 150, 143, 200, 193, 186, 179, 172, 165, 158, 151, + 208, 201, 194, 187, 180, 173, 166, 159, 216, 209, 202, 195, 188, + 181, 174, 167, 224, 217, 210, 203, 196, 189, 182, 175, 232, 225, + 218, 211, 204, 197, 190, 183, 240, 233, 226, 219, 212, 205, 198, + 191, 248, 241, 234, 227, 220, 213, 206, 199, 256, 249, 242, 235, + 228, 221, 214, 207, 264, 257, 250, 243, 236, 229, 222, 215, 272, + 265, 258, 251, 244, 237, 230, 223, 280, 273, 266, 259, 252, 245, + 238, 231, 288, 281, 274, 267, 260, 253, 246, 239, 296, 289, 282, + 275, 268, 261, 254, 247, 304, 297, 290, 283, 276, 269, 262, 255, + 312, 305, 298, 291, 284, 277, 270, 263, 320, 313, 306, 299, 292, + 285, 278, 271, 328, 321, 314, 307, 300, 293, 286, 279, 336, 329, + 322, 315, 308, 301, 294, 287, 344, 337, 330, 323, 316, 309, 302, + 295, 352, 345, 338, 331, 324, 317, 310, 303, 360, 353, 346, 339, + 332, 325, 318, 311, 368, 361, 354, 347, 340, 333, 326, 319, 376, + 369, 362, 355, 348, 341, 334, 327, 384, 377, 370, 363, 356, 349, + 342, 335, 392, 385, 378, 371, 364, 357, 350, 343, 400, 393, 386, + 379, 372, 365, 358, 351, 408, 401, 394, 387, 380, 373, 366, 359, + 416, 409, 402, 395, 388, 381, 374, 367, 424, 417, 410, 403, 396, + 389, 382, 375, 432, 425, 418, 411, 404, 397, 390, 383, 440, 433, + 426, 419, 412, 405, 398, 391, 448, 441, 434, 427, 420, 413, 406, + 399, 456, 449, 442, 435, 428, 421, 414, 407, 464, 
457, 450, 443, + 436, 429, 422, 415, 472, 465, 458, 451, 444, 437, 430, 423, 480, + 473, 466, 459, 452, 445, 438, 431, 488, 481, 474, 467, 460, 453, + 446, 439, 496, 489, 482, 475, 468, 461, 454, 447, 504, 497, 490, + 483, 476, 469, 462, 455, 505, 498, 491, 484, 477, 470, 463, 506, + 499, 492, 485, 478, 471, 507, 500, 493, 486, 479, 508, 501, 494, + 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, // 16xN + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, // 16x2 + 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, // 16x4 + 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, + 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, + 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, + 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, + 47, 63, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, // 16x8 + 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, + 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, + 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, + 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, + 71, 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, + 118, 103, 88, 73, 58, 43, 28, 13, 119, 104, 89, 74, 59, + 44, 29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, + 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, + 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, // 16x16 + 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, + 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, + 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, 54, + 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, + 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, + 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, + 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, + 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, + 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, + 90, 75, 60, 45, 30, 15, 241, 226, 211, 196, 181, 166, 151, + 136, 121, 106, 91, 76, 61, 46, 31, 242, 227, 212, 197, 182, + 167, 152, 137, 122, 107, 92, 77, 62, 47, 243, 228, 213, 198, + 183, 168, 153, 138, 123, 108, 93, 78, 63, 244, 229, 214, 199, + 184, 169, 154, 139, 124, 109, 94, 79, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 246, 231, 216, 201, 186, 171, 156, + 141, 126, 111, 247, 232, 217, 202, 187, 172, 157, 142, 127, 248, + 233, 218, 203, 188, 173, 158, 143, 249, 234, 219, 204, 189, 174, + 159, 250, 235, 220, 205, 190, 175, 251, 236, 221, 206, 191, 252, + 237, 222, 207, 253, 238, 223, 254, 239, 255, 0, 16, 1, 32, // 16x32 + 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, + 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, + 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, + 23, 8, 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 160, + 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, 176, 161, 146, + 131, 116, 101, 86, 71, 56, 41, 26, 11, 192, 177, 162, 147, + 132, 117, 102, 87, 72, 57, 42, 27, 12, 208, 193, 178, 163, + 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, 224, 209, 194, + 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 240, + 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, + 30, 15, 256, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, + 91, 76, 61, 46, 31, 272, 257, 242, 227, 212, 197, 182, 167, + 152, 137, 122, 107, 92, 77, 62, 47, 288, 273, 258, 243, 228, + 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 304, 289, + 274, 259, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, + 79, 
320, 305, 290, 275, 260, 245, 230, 215, 200, 185, 170, 155, + 140, 125, 110, 95, 336, 321, 306, 291, 276, 261, 246, 231, 216, + 201, 186, 171, 156, 141, 126, 111, 352, 337, 322, 307, 292, 277, + 262, 247, 232, 217, 202, 187, 172, 157, 142, 127, 368, 353, 338, + 323, 308, 293, 278, 263, 248, 233, 218, 203, 188, 173, 158, 143, + 384, 369, 354, 339, 324, 309, 294, 279, 264, 249, 234, 219, 204, + 189, 174, 159, 400, 385, 370, 355, 340, 325, 310, 295, 280, 265, + 250, 235, 220, 205, 190, 175, 416, 401, 386, 371, 356, 341, 326, + 311, 296, 281, 266, 251, 236, 221, 206, 191, 432, 417, 402, 387, + 372, 357, 342, 327, 312, 297, 282, 267, 252, 237, 222, 207, 448, + 433, 418, 403, 388, 373, 358, 343, 328, 313, 298, 283, 268, 253, + 238, 223, 464, 449, 434, 419, 404, 389, 374, 359, 344, 329, 314, + 299, 284, 269, 254, 239, 480, 465, 450, 435, 420, 405, 390, 375, + 360, 345, 330, 315, 300, 285, 270, 255, 496, 481, 466, 451, 436, + 421, 406, 391, 376, 361, 346, 331, 316, 301, 286, 271, 497, 482, + 467, 452, 437, 422, 407, 392, 377, 362, 347, 332, 317, 302, 287, + 498, 483, 468, 453, 438, 423, 408, 393, 378, 363, 348, 333, 318, + 303, 499, 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, + 319, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, 350, 335, + 501, 486, 471, 456, 441, 426, 411, 396, 381, 366, 351, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 503, 488, 473, 458, 443, + 428, 413, 398, 383, 504, 489, 474, 459, 444, 429, 414, 399, 505, + 490, 475, 460, 445, 430, 415, 506, 491, 476, 461, 446, 431, 507, + 492, 477, 462, 447, 508, 493, 478, 463, 509, 494, 479, 510, 495, + 511, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, // 16x64 + 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, + 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, + 98, 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, + 54, 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, + 25, 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, + 11, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, + 28, 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, + 59, 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, + 105, 90, 75, 60, 45, 30, 15, 256, 241, 226, 211, 196, 181, + 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 272, 257, 242, + 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, + 288, 273, 258, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, + 93, 78, 63, 304, 289, 274, 259, 244, 229, 214, 199, 184, 169, + 154, 139, 124, 109, 94, 79, 320, 305, 290, 275, 260, 245, 230, + 215, 200, 185, 170, 155, 140, 125, 110, 95, 336, 321, 306, 291, + 276, 261, 246, 231, 216, 201, 186, 171, 156, 141, 126, 111, 352, + 337, 322, 307, 292, 277, 262, 247, 232, 217, 202, 187, 172, 157, + 142, 127, 368, 353, 338, 323, 308, 293, 278, 263, 248, 233, 218, + 203, 188, 173, 158, 143, 384, 369, 354, 339, 324, 309, 294, 279, + 264, 249, 234, 219, 204, 189, 174, 159, 400, 385, 370, 355, 340, + 325, 310, 295, 280, 265, 250, 235, 220, 205, 190, 175, 416, 401, + 386, 371, 356, 341, 326, 311, 296, 281, 266, 251, 236, 221, 206, + 191, 432, 417, 402, 387, 372, 357, 342, 327, 312, 297, 282, 267, + 252, 237, 222, 207, 448, 433, 418, 403, 388, 373, 358, 343, 328, + 313, 298, 283, 268, 253, 238, 223, 464, 449, 434, 419, 404, 389, + 374, 359, 344, 329, 314, 299, 284, 269, 254, 239, 480, 465, 450, + 435, 420, 405, 390, 375, 360, 345, 330, 315, 300, 285, 270, 255, + 496, 481, 466, 451, 436, 421, 406, 391, 376, 361, 346, 331, 316, + 301, 286, 271, 512, 497, 482, 467, 452, 
437, 422, 407, 392, 377, + 362, 347, 332, 317, 302, 287, 528, 513, 498, 483, 468, 453, 438, + 423, 408, 393, 378, 363, 348, 333, 318, 303, 544, 529, 514, 499, + 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, 319, 560, + 545, 530, 515, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, + 350, 335, 576, 561, 546, 531, 516, 501, 486, 471, 456, 441, 426, + 411, 396, 381, 366, 351, 592, 577, 562, 547, 532, 517, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 608, 593, 578, 563, 548, + 533, 518, 503, 488, 473, 458, 443, 428, 413, 398, 383, 624, 609, + 594, 579, 564, 549, 534, 519, 504, 489, 474, 459, 444, 429, 414, + 399, 640, 625, 610, 595, 580, 565, 550, 535, 520, 505, 490, 475, + 460, 445, 430, 415, 656, 641, 626, 611, 596, 581, 566, 551, 536, + 521, 506, 491, 476, 461, 446, 431, 672, 657, 642, 627, 612, 597, + 582, 567, 552, 537, 522, 507, 492, 477, 462, 447, 688, 673, 658, + 643, 628, 613, 598, 583, 568, 553, 538, 523, 508, 493, 478, 463, + 704, 689, 674, 659, 644, 629, 614, 599, 584, 569, 554, 539, 524, + 509, 494, 479, 720, 705, 690, 675, 660, 645, 630, 615, 600, 585, + 570, 555, 540, 525, 510, 495, 736, 721, 706, 691, 676, 661, 646, + 631, 616, 601, 586, 571, 556, 541, 526, 511, 752, 737, 722, 707, + 692, 677, 662, 647, 632, 617, 602, 587, 572, 557, 542, 527, 768, + 753, 738, 723, 708, 693, 678, 663, 648, 633, 618, 603, 588, 573, + 558, 543, 784, 769, 754, 739, 724, 709, 694, 679, 664, 649, 634, + 619, 604, 589, 574, 559, 800, 785, 770, 755, 740, 725, 710, 695, + 680, 665, 650, 635, 620, 605, 590, 575, 816, 801, 786, 771, 756, + 741, 726, 711, 696, 681, 666, 651, 636, 621, 606, 591, 832, 817, + 802, 787, 772, 757, 742, 727, 712, 697, 682, 667, 652, 637, 622, + 607, 848, 833, 818, 803, 788, 773, 758, 743, 728, 713, 698, 683, + 668, 653, 638, 623, 864, 849, 834, 819, 804, 789, 774, 759, 744, + 729, 714, 699, 684, 669, 654, 639, 880, 865, 850, 835, 820, 805, + 790, 775, 760, 745, 730, 715, 700, 685, 670, 655, 896, 881, 866, + 851, 836, 821, 806, 791, 776, 761, 746, 731, 716, 701, 686, 671, + 912, 897, 882, 867, 852, 837, 822, 807, 792, 777, 762, 747, 732, + 717, 702, 687, 928, 913, 898, 883, 868, 853, 838, 823, 808, 793, + 778, 763, 748, 733, 718, 703, 944, 929, 914, 899, 884, 869, 854, + 839, 824, 809, 794, 779, 764, 749, 734, 719, 960, 945, 930, 915, + 900, 885, 870, 855, 840, 825, 810, 795, 780, 765, 750, 735, 976, + 961, 946, 931, 916, 901, 886, 871, 856, 841, 826, 811, 796, 781, + 766, 751, 992, 977, 962, 947, 932, 917, 902, 887, 872, 857, 842, + 827, 812, 797, 782, 767, 1008, 993, 978, 963, 948, 933, 918, 903, + 888, 873, 858, 843, 828, 813, 798, 783, 1009, 994, 979, 964, 949, + 934, 919, 904, 889, 874, 859, 844, 829, 814, 799, 1010, 995, 980, + 965, 950, 935, 920, 905, 890, 875, 860, 845, 830, 815, 1011, 996, + 981, 966, 951, 936, 921, 906, 891, 876, 861, 846, 831, 1012, 997, + 982, 967, 952, 937, 922, 907, 892, 877, 862, 847, 1013, 998, 983, + 968, 953, 938, 923, 908, 893, 878, 863, 1014, 999, 984, 969, 954, + 939, 924, 909, 894, 879, 1015, 1000, 985, 970, 955, 940, 925, 910, + 895, 1016, 1001, 986, 971, 956, 941, 926, 911, 1017, 1002, 987, 972, + 957, 942, 927, 1018, 1003, 988, 973, 958, 943, 1019, 1004, 989, 974, + 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, // 32xN + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, // 32x2 + 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, + 49, 18, 50, 19, 51, 20, 
52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, + 62, 31, 63, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, // 32x4 + 97, 66, 35, 4, 98, 67, 36, 5, 99, 68, 37, 6, 100, + 69, 38, 7, 101, 70, 39, 8, 102, 71, 40, 9, 103, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 12, 106, 75, 44, + 13, 107, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 16, + 110, 79, 48, 17, 111, 80, 49, 18, 112, 81, 50, 19, 113, + 82, 51, 20, 114, 83, 52, 21, 115, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 24, 118, 87, 56, 25, 119, 88, 57, + 26, 120, 89, 58, 27, 121, 90, 59, 28, 122, 91, 60, 29, + 123, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, + 127, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, // 32x8 + 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, + 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, + 163, 132, 101, 70, 39, 8, 226, 195, 164, 133, 102, 71, 40, + 9, 227, 196, 165, 134, 103, 72, 41, 10, 228, 197, 166, 135, + 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, 12, 230, + 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, + 140, 109, 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, + 235, 204, 173, 142, 111, 80, 49, 18, 236, 205, 174, 143, 112, + 81, 50, 19, 237, 206, 175, 144, 113, 82, 51, 20, 238, 207, + 176, 145, 114, 83, 52, 21, 239, 208, 177, 146, 115, 84, 53, + 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, 210, 179, 148, + 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, 243, + 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, + 58, 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, + 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, + 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, + 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 0, 32, 1, // 32x16 + 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, + 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, + 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, + 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, + 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, 352, 321, + 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, 322, + 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, + 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, + 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, + 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, + 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, + 265, 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, + 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 484, + 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, + 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, 175, + 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, + 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, + 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, 242, + 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, + 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, + 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, + 90, 59, 28, 
494, 463, 432, 401, 370, 339, 308, 277, 246, 215, + 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, + 278, 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, + 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 497, + 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, + 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, + 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, + 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, + 159, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, + 471, 440, 409, 378, 347, 316, 285, 254, 223, 503, 472, 441, 410, + 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, + 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, + 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, // 32x32 + 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, + 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, + 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, + 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, + 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, + 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, + 43, 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, + 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, + 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, + 232, 201, 170, 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, + 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, + 513, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, + 110, 79, 48, 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, + 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, + 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, + 112, 81, 50, 19, 640, 609, 578, 547, 516, 485, 454, 423, 392, + 361, 330, 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, + 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, + 549, 518, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, + 146, 115, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, + 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, + 85, 54, 23, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, + 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, + 55, 24, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, + 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, + 56, 25, 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, + 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, + 88, 57, 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, + 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, + 151, 120, 89, 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, + 648, 617, 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, + 245, 214, 183, 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, + 773, 742, 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, + 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, + 929, 898, 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, + 526, 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, + 123, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, + 713, 682, 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, + 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, + 900, 869, 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 
528, + 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 995, 964, 933, 902, 871, 840, 809, + 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, + 375, 344, 313, 282, 251, 220, 189, 158, 127, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, + 935, 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, + 532, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 998, + 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, + 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 999, + 968, 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, + 565, 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1000, 969, + 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, + 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 908, + 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, + 474, 443, 412, 381, 350, 319, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, + 662, 631, 600, 569, 538, 507, 476, 445, 414, 383, 1004, 973, 942, + 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, + 508, 477, 446, 415, 1005, 974, 943, 912, 881, 850, 819, 788, 757, + 726, 695, 664, 633, 602, 571, 540, 509, 478, 447, 1006, 975, 944, + 913, 882, 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, + 510, 479, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1008, 977, 946, 915, 884, 853, 822, + 791, 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 916, + 885, 854, 823, 792, 761, 730, 699, 668, 637, 606, 575, 1010, 979, + 948, 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1011, + 980, 949, 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 1012, + 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, + 951, 920, 889, 858, 827, 796, 765, 734, 703, 1014, 983, 952, 921, + 890, 859, 828, 797, 766, 735, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, + 924, 893, 862, 831, 1018, 987, 956, 925, 894, 863, 1019, 988, 957, + 926, 895, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, // 32x64 + 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, + 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, + 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, + 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, + 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, + 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, + 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, + 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, + 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, + 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, 357, 326, 295, + 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, 513, 482, 451, + 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, + 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, 235, + 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, 515, 484, 453, + 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, + 19, 640, 609, 578, 547, 516, 485, 
454, 423, 392, 361, 330, 299, + 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, 641, 610, 579, + 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, + 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, 549, 518, 487, + 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, + 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, + 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, + 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, 396, + 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 800, + 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, + 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 832, + 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, 429, + 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, + 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, 492, + 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, + 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, 586, + 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, + 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, 773, 742, 711, + 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, 308, + 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, 929, 898, 867, + 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, 464, + 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, + 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, 248, + 217, 186, 155, 124, 93, 62, 31, 1024, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, +1056, 1025, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 1088, 1057, 1026, 995, 964, 933, 902, + 871, 840, 809, 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, + 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 1120, +1089, 1058, 1027, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, + 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, 314, + 283, 252, 221, 190, 159, 1152, 1121, 1090, 1059, 1028, 997, 966, 935, + 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, + 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 1184, 1153, +1122, 1091, 1060, 1029, 998, 967, 936, 905, 874, 843, 812, 781, 750, + 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, + 316, 285, 254, 223, 1216, 1185, 1154, 1123, 1092, 1061, 1030, 999, 968, + 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, + 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1248, 1217, 1186, +1155, 1124, 1093, 1062, 1031, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, 628, 597, 566, 535, 504, 473, 442, 411, 380, + 349, 318, 287, 1280, 1249, 1218, 1187, 1156, 1125, 1094, 1063, 1032, 1001, + 970, 939, 908, 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, + 567, 536, 505, 474, 443, 412, 381, 350, 319, 1312, 1281, 1250, 1219, +1188, 1157, 1126, 1095, 1064, 1033, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1344, 1313, 1282, 1251, 1220, 1189, 1158, 1127, 1096, 1065, 1034, +1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 1376, 1345, 1314, 1283, 1252, +1221, 1190, 1159, 1128, 1097, 1066, 1035, 1004, 973, 942, 911, 880, 849, + 
818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, 446, + 415, 1408, 1377, 1346, 1315, 1284, 1253, 1222, 1191, 1160, 1129, 1098, 1067, +1036, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, 664, + 633, 602, 571, 540, 509, 478, 447, 1440, 1409, 1378, 1347, 1316, 1285, +1254, 1223, 1192, 1161, 1130, 1099, 1068, 1037, 1006, 975, 944, 913, 882, + 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, +1472, 1441, 1410, 1379, 1348, 1317, 1286, 1255, 1224, 1193, 1162, 1131, 1100, +1069, 1038, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1504, 1473, 1442, 1411, 1380, 1349, 1318, +1287, 1256, 1225, 1194, 1163, 1132, 1101, 1070, 1039, 1008, 977, 946, 915, + 884, 853, 822, 791, 760, 729, 698, 667, 636, 605, 574, 543, 1536, +1505, 1474, 1443, 1412, 1381, 1350, 1319, 1288, 1257, 1226, 1195, 1164, 1133, +1102, 1071, 1040, 1009, 978, 947, 916, 885, 854, 823, 792, 761, 730, + 699, 668, 637, 606, 575, 1568, 1537, 1506, 1475, 1444, 1413, 1382, 1351, +1320, 1289, 1258, 1227, 1196, 1165, 1134, 1103, 1072, 1041, 1010, 979, 948, + 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1600, 1569, +1538, 1507, 1476, 1445, 1414, 1383, 1352, 1321, 1290, 1259, 1228, 1197, 1166, +1135, 1104, 1073, 1042, 1011, 980, 949, 918, 887, 856, 825, 794, 763, + 732, 701, 670, 639, 1632, 1601, 1570, 1539, 1508, 1477, 1446, 1415, 1384, +1353, 1322, 1291, 1260, 1229, 1198, 1167, 1136, 1105, 1074, 1043, 1012, 981, + 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1664, 1633, 1602, +1571, 1540, 1509, 1478, 1447, 1416, 1385, 1354, 1323, 1292, 1261, 1230, 1199, +1168, 1137, 1106, 1075, 1044, 1013, 982, 951, 920, 889, 858, 827, 796, + 765, 734, 703, 1696, 1665, 1634, 1603, 1572, 1541, 1510, 1479, 1448, 1417, +1386, 1355, 1324, 1293, 1262, 1231, 1200, 1169, 1138, 1107, 1076, 1045, 1014, + 983, 952, 921, 890, 859, 828, 797, 766, 735, 1728, 1697, 1666, 1635, +1604, 1573, 1542, 1511, 1480, 1449, 1418, 1387, 1356, 1325, 1294, 1263, 1232, +1201, 1170, 1139, 1108, 1077, 1046, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1760, 1729, 1698, 1667, 1636, 1605, 1574, 1543, 1512, 1481, 1450, +1419, 1388, 1357, 1326, 1295, 1264, 1233, 1202, 1171, 1140, 1109, 1078, 1047, +1016, 985, 954, 923, 892, 861, 830, 799, 1792, 1761, 1730, 1699, 1668, +1637, 1606, 1575, 1544, 1513, 1482, 1451, 1420, 1389, 1358, 1327, 1296, 1265, +1234, 1203, 1172, 1141, 1110, 1079, 1048, 1017, 986, 955, 924, 893, 862, + 831, 1824, 1793, 1762, 1731, 1700, 1669, 1638, 1607, 1576, 1545, 1514, 1483, +1452, 1421, 1390, 1359, 1328, 1297, 1266, 1235, 1204, 1173, 1142, 1111, 1080, +1049, 1018, 987, 956, 925, 894, 863, 1856, 1825, 1794, 1763, 1732, 1701, +1670, 1639, 1608, 1577, 1546, 1515, 1484, 1453, 1422, 1391, 1360, 1329, 1298, +1267, 1236, 1205, 1174, 1143, 1112, 1081, 1050, 1019, 988, 957, 926, 895, +1888, 1857, 1826, 1795, 1764, 1733, 1702, 1671, 1640, 1609, 1578, 1547, 1516, +1485, 1454, 1423, 1392, 1361, 1330, 1299, 1268, 1237, 1206, 1175, 1144, 1113, +1082, 1051, 1020, 989, 958, 927, 1920, 1889, 1858, 1827, 1796, 1765, 1734, +1703, 1672, 1641, 1610, 1579, 1548, 1517, 1486, 1455, 1424, 1393, 1362, 1331, +1300, 1269, 1238, 1207, 1176, 1145, 1114, 1083, 1052, 1021, 990, 959, 1952, +1921, 1890, 1859, 1828, 1797, 1766, 1735, 1704, 1673, 1642, 1611, 1580, 1549, +1518, 1487, 1456, 1425, 1394, 1363, 1332, 1301, 1270, 1239, 1208, 1177, 1146, +1115, 1084, 1053, 1022, 991, 1984, 1953, 1922, 1891, 1860, 1829, 1798, 1767, +1736, 1705, 1674, 1643, 1612, 1581, 1550, 1519, 1488, 1457, 1426, 1395, 1364, +1333, 1302, 
1271, 1240, 1209, 1178, 1147, 1116, 1085, 1054, 1023, 2016, 1985, +1954, 1923, 1892, 1861, 1830, 1799, 1768, 1737, 1706, 1675, 1644, 1613, 1582, +1551, 1520, 1489, 1458, 1427, 1396, 1365, 1334, 1303, 1272, 1241, 1210, 1179, +1148, 1117, 1086, 1055, 2017, 1986, 1955, 1924, 1893, 1862, 1831, 1800, 1769, +1738, 1707, 1676, 1645, 1614, 1583, 1552, 1521, 1490, 1459, 1428, 1397, 1366, +1335, 1304, 1273, 1242, 1211, 1180, 1149, 1118, 1087, 2018, 1987, 1956, 1925, +1894, 1863, 1832, 1801, 1770, 1739, 1708, 1677, 1646, 1615, 1584, 1553, 1522, +1491, 1460, 1429, 1398, 1367, 1336, 1305, 1274, 1243, 1212, 1181, 1150, 1119, +2019, 1988, 1957, 1926, 1895, 1864, 1833, 1802, 1771, 1740, 1709, 1678, 1647, +1616, 1585, 1554, 1523, 1492, 1461, 1430, 1399, 1368, 1337, 1306, 1275, 1244, +1213, 1182, 1151, 2020, 1989, 1958, 1927, 1896, 1865, 1834, 1803, 1772, 1741, +1710, 1679, 1648, 1617, 1586, 1555, 1524, 1493, 1462, 1431, 1400, 1369, 1338, +1307, 1276, 1245, 1214, 1183, 2021, 1990, 1959, 1928, 1897, 1866, 1835, 1804, +1773, 1742, 1711, 1680, 1649, 1618, 1587, 1556, 1525, 1494, 1463, 1432, 1401, +1370, 1339, 1308, 1277, 1246, 1215, 2022, 1991, 1960, 1929, 1898, 1867, 1836, +1805, 1774, 1743, 1712, 1681, 1650, 1619, 1588, 1557, 1526, 1495, 1464, 1433, +1402, 1371, 1340, 1309, 1278, 1247, 2023, 1992, 1961, 1930, 1899, 1868, 1837, +1806, 1775, 1744, 1713, 1682, 1651, 1620, 1589, 1558, 1527, 1496, 1465, 1434, +1403, 1372, 1341, 1310, 1279, 2024, 1993, 1962, 1931, 1900, 1869, 1838, 1807, +1776, 1745, 1714, 1683, 1652, 1621, 1590, 1559, 1528, 1497, 1466, 1435, 1404, +1373, 1342, 1311, 2025, 1994, 1963, 1932, 1901, 1870, 1839, 1808, 1777, 1746, +1715, 1684, 1653, 1622, 1591, 1560, 1529, 1498, 1467, 1436, 1405, 1374, 1343, +2026, 1995, 1964, 1933, 1902, 1871, 1840, 1809, 1778, 1747, 1716, 1685, 1654, +1623, 1592, 1561, 1530, 1499, 1468, 1437, 1406, 1375, 2027, 1996, 1965, 1934, +1903, 1872, 1841, 1810, 1779, 1748, 1717, 1686, 1655, 1624, 1593, 1562, 1531, +1500, 1469, 1438, 1407, 2028, 1997, 1966, 1935, 1904, 1873, 1842, 1811, 1780, +1749, 1718, 1687, 1656, 1625, 1594, 1563, 1532, 1501, 1470, 1439, 2029, 1998, +1967, 1936, 1905, 1874, 1843, 1812, 1781, 1750, 1719, 1688, 1657, 1626, 1595, +1564, 1533, 1502, 1471, 2030, 1999, 1968, 1937, 1906, 1875, 1844, 1813, 1782, +1751, 1720, 1689, 1658, 1627, 1596, 1565, 1534, 1503, 2031, 2000, 1969, 1938, +1907, 1876, 1845, 1814, 1783, 1752, 1721, 1690, 1659, 1628, 1597, 1566, 1535, +2032, 2001, 1970, 1939, 1908, 1877, 1846, 1815, 1784, 1753, 1722, 1691, 1660, +1629, 1598, 1567, 2033, 2002, 1971, 1940, 1909, 1878, 1847, 1816, 1785, 1754, +1723, 1692, 1661, 1630, 1599, 2034, 2003, 1972, 1941, 1910, 1879, 1848, 1817, +1786, 1755, 1724, 1693, 1662, 1631, 2035, 2004, 1973, 1942, 1911, 1880, 1849, +1818, 1787, 1756, 1725, 1694, 1663, 2036, 2005, 1974, 1943, 1912, 1881, 1850, +1819, 1788, 1757, 1726, 1695, 2037, 2006, 1975, 1944, 1913, 1882, 1851, 1820, +1789, 1758, 1727, 2038, 2007, 1976, 1945, 1914, 1883, 1852, 1821, 1790, 1759, +2039, 2008, 1977, 1946, 1915, 1884, 1853, 1822, 1791, 2040, 2009, 1978, 1947, +1916, 1885, 1854, 1823, 2041, 2010, 1979, 1948, 1917, 1886, 1855, 2042, 2011, +1980, 1949, 1918, 1887, 2043, 2012, 1981, 1950, 1919, 2044, 2013, 1982, 1951, +2045, 2014, 1983, 2046, 2015, 2047, 0, 1, 2, 3, 4, 5, 6, // 64xN + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 0, 64, 1, 65, 2, 66, 3, 67, 
// 64x2 + 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, + 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, + 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, + 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, + 30, 94, 31, 95, 32, 96, 33, 97, 34, 98, 35, 99, 36, + 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, + 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, + 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, + 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, + 126, 63, 127, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, // 64x4 + 193, 130, 67, 4, 194, 131, 68, 5, 195, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 8, 198, 135, 72, 9, 199, 136, + 73, 10, 200, 137, 74, 11, 201, 138, 75, 12, 202, 139, 76, + 13, 203, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 16, + 206, 143, 80, 17, 207, 144, 81, 18, 208, 145, 82, 19, 209, + 146, 83, 20, 210, 147, 84, 21, 211, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 24, 214, 151, 88, 25, 215, 152, 89, + 26, 216, 153, 90, 27, 217, 154, 91, 28, 218, 155, 92, 29, + 219, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 32, 222, + 159, 96, 33, 223, 160, 97, 34, 224, 161, 98, 35, 225, 162, + 99, 36, 226, 163, 100, 37, 227, 164, 101, 38, 228, 165, 102, + 39, 229, 166, 103, 40, 230, 167, 104, 41, 231, 168, 105, 42, + 232, 169, 106, 43, 233, 170, 107, 44, 234, 171, 108, 45, 235, + 172, 109, 46, 236, 173, 110, 47, 237, 174, 111, 48, 238, 175, + 112, 49, 239, 176, 113, 50, 240, 177, 114, 51, 241, 178, 115, + 52, 242, 179, 116, 53, 243, 180, 117, 54, 244, 181, 118, 55, + 245, 182, 119, 56, 246, 183, 120, 57, 247, 184, 121, 58, 248, + 185, 122, 59, 249, 186, 123, 60, 250, 187, 124, 61, 251, 188, + 125, 62, 252, 189, 126, 63, 253, 190, 127, 254, 191, 255, 0, // 64x8 + 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, 130, 67, + 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, + 6, 448, 385, 322, 259, 196, 133, 70, 7, 449, 386, 323, 260, + 197, 134, 71, 8, 450, 387, 324, 261, 198, 135, 72, 9, 451, + 388, 325, 262, 199, 136, 73, 10, 452, 389, 326, 263, 200, 137, + 74, 11, 453, 390, 327, 264, 201, 138, 75, 12, 454, 391, 328, + 265, 202, 139, 76, 13, 455, 392, 329, 266, 203, 140, 77, 14, + 456, 393, 330, 267, 204, 141, 78, 15, 457, 394, 331, 268, 205, + 142, 79, 16, 458, 395, 332, 269, 206, 143, 80, 17, 459, 396, + 333, 270, 207, 144, 81, 18, 460, 397, 334, 271, 208, 145, 82, + 19, 461, 398, 335, 272, 209, 146, 83, 20, 462, 399, 336, 273, + 210, 147, 84, 21, 463, 400, 337, 274, 211, 148, 85, 22, 464, + 401, 338, 275, 212, 149, 86, 23, 465, 402, 339, 276, 213, 150, + 87, 24, 466, 403, 340, 277, 214, 151, 88, 25, 467, 404, 341, + 278, 215, 152, 89, 26, 468, 405, 342, 279, 216, 153, 90, 27, + 469, 406, 343, 280, 217, 154, 91, 28, 470, 407, 344, 281, 218, + 155, 92, 29, 471, 408, 345, 282, 219, 156, 93, 30, 472, 409, + 346, 283, 220, 157, 94, 31, 473, 410, 347, 284, 221, 158, 95, + 32, 474, 411, 348, 285, 222, 159, 96, 33, 475, 412, 349, 286, + 223, 160, 97, 34, 476, 413, 350, 287, 224, 161, 98, 35, 477, + 414, 351, 288, 225, 162, 99, 36, 478, 415, 352, 289, 226, 163, + 100, 37, 479, 416, 353, 290, 227, 164, 101, 38, 480, 417, 354, + 291, 228, 165, 102, 39, 481, 418, 355, 292, 229, 166, 103, 40, + 482, 419, 356, 293, 230, 167, 104, 41, 483, 420, 357, 294, 231, + 168, 105, 42, 484, 421, 358, 295, 232, 169, 106, 43, 485, 422, + 359, 296, 233, 170, 107, 44, 486, 423, 360, 297, 234, 171, 108, + 45, 487, 424, 361, 298, 235, 172, 109, 46, 488, 425, 362, 299, + 236, 173, 110, 47, 489, 426, 363, 300, 237, 174, 111, 48, 490, + 427, 364, 
301, 238, 175, 112, 49, 491, 428, 365, 302, 239, 176, + 113, 50, 492, 429, 366, 303, 240, 177, 114, 51, 493, 430, 367, + 304, 241, 178, 115, 52, 494, 431, 368, 305, 242, 179, 116, 53, + 495, 432, 369, 306, 243, 180, 117, 54, 496, 433, 370, 307, 244, + 181, 118, 55, 497, 434, 371, 308, 245, 182, 119, 56, 498, 435, + 372, 309, 246, 183, 120, 57, 499, 436, 373, 310, 247, 184, 121, + 58, 500, 437, 374, 311, 248, 185, 122, 59, 501, 438, 375, 312, + 249, 186, 123, 60, 502, 439, 376, 313, 250, 187, 124, 61, 503, + 440, 377, 314, 251, 188, 125, 62, 504, 441, 378, 315, 252, 189, + 126, 63, 505, 442, 379, 316, 253, 190, 127, 506, 443, 380, 317, + 254, 191, 507, 444, 381, 318, 255, 508, 445, 382, 319, 509, 446, + 383, 510, 447, 511, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x16 + 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, + 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, + 7, 512, 449, 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, + 387, 324, 261, 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, + 262, 199, 136, 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, + 200, 137, 74, 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, + 201, 138, 75, 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, + 265, 202, 139, 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, + 392, 329, 266, 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, + 582, 519, 456, 393, 330, 267, 204, 141, 78, 15, 961, 898, 835, + 772, 709, 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, + 333, 270, 207, 144, 81, 18, 964, 901, 838, 775, 712, 649, 586, + 523, 460, 397, 334, 271, 208, 145, 82, 19, 965, 902, 839, 776, + 713, 650, 587, 524, 461, 398, 335, 272, 209, 146, 83, 20, 966, + 903, 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, + 84, 21, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, + 274, 211, 148, 85, 22, 968, 905, 842, 779, 716, 653, 590, 527, + 464, 401, 338, 275, 212, 149, 86, 23, 969, 906, 843, 780, 717, + 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, 24, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 971, 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, + 215, 152, 89, 26, 972, 909, 846, 783, 720, 657, 594, 531, 468, + 405, 342, 279, 216, 153, 90, 27, 973, 910, 847, 784, 721, 658, + 595, 532, 469, 406, 343, 280, 217, 154, 91, 28, 974, 911, 848, + 785, 722, 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, + 346, 283, 220, 157, 94, 31, 977, 914, 851, 788, 725, 662, 599, + 536, 473, 410, 347, 284, 221, 158, 95, 32, 978, 915, 852, 789, + 726, 663, 600, 537, 474, 411, 348, 285, 222, 159, 96, 33, 979, + 916, 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, + 97, 34, 980, 917, 854, 791, 728, 665, 602, 539, 476, 413, 350, + 287, 224, 161, 98, 35, 981, 918, 855, 792, 729, 666, 603, 540, + 477, 414, 351, 288, 225, 162, 99, 36, 982, 919, 856, 793, 730, + 667, 604, 541, 478, 415, 352, 289, 226, 163, 100, 37, 983, 920, + 857, 794, 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, + 38, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, 291, + 228, 165, 102, 39, 985, 922, 859, 796, 733, 670, 607, 544, 481, + 418, 355, 292, 229, 166, 103, 40, 986, 923, 860, 797, 734, 671, + 608, 545, 482, 419, 356, 293, 230, 167, 104, 41, 987, 924, 861, + 798, 735, 672, 609, 546, 483, 420, 357, 
294, 231, 168, 105, 42, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, + 359, 296, 233, 170, 107, 44, 990, 927, 864, 801, 738, 675, 612, + 549, 486, 423, 360, 297, 234, 171, 108, 45, 991, 928, 865, 802, + 739, 676, 613, 550, 487, 424, 361, 298, 235, 172, 109, 46, 992, + 929, 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, + 110, 47, 993, 930, 867, 804, 741, 678, 615, 552, 489, 426, 363, + 300, 237, 174, 111, 48, 994, 931, 868, 805, 742, 679, 616, 553, + 490, 427, 364, 301, 238, 175, 112, 49, 995, 932, 869, 806, 743, + 680, 617, 554, 491, 428, 365, 302, 239, 176, 113, 50, 996, 933, + 870, 807, 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, + 51, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, 304, + 241, 178, 115, 52, 998, 935, 872, 809, 746, 683, 620, 557, 494, + 431, 368, 305, 242, 179, 116, 53, 999, 936, 873, 810, 747, 684, + 621, 558, 495, 432, 369, 306, 243, 180, 117, 54, 1000, 937, 874, + 811, 748, 685, 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, + 372, 309, 246, 183, 120, 57, 1003, 940, 877, 814, 751, 688, 625, + 562, 499, 436, 373, 310, 247, 184, 121, 58, 1004, 941, 878, 815, + 752, 689, 626, 563, 500, 437, 374, 311, 248, 185, 122, 59, 1005, + 942, 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, + 123, 60, 1006, 943, 880, 817, 754, 691, 628, 565, 502, 439, 376, + 313, 250, 187, 124, 61, 1007, 944, 881, 818, 755, 692, 629, 566, + 503, 440, 377, 314, 251, 188, 125, 62, 1008, 945, 882, 819, 756, + 693, 630, 567, 504, 441, 378, 315, 252, 189, 126, 63, 1009, 946, + 883, 820, 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, +1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, 254, + 191, 1011, 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, + 255, 1012, 949, 886, 823, 760, 697, 634, 571, 508, 445, 382, 319, +1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, 383, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 1015, 952, 889, 826, 763, + 700, 637, 574, 511, 1016, 953, 890, 827, 764, 701, 638, 575, 1017, + 954, 891, 828, 765, 702, 639, 1018, 955, 892, 829, 766, 703, 1019, + 956, 893, 830, 767, 1020, 957, 894, 831, 1021, 958, 895, 1022, 959, +1023, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, // 64x32 + 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, + 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, + 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, 387, 324, 261, + 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, 262, 199, 136, + 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, 200, 137, 74, + 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, 201, 138, 75, + 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, 265, 202, 139, + 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, 392, 329, 266, + 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, 582, 519, 456, + 393, 330, 267, 204, 141, 78, 15, 1024, 961, 898, 835, 772, 709, + 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, 1088, 1025, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 1152, 1089, 1026, 963, 900, 837, 774, 711, 648, 585, + 522, 459, 396, 333, 270, 207, 144, 81, 18, 1216, 1153, 1090, 1027, + 964, 901, 838, 775, 712, 649, 586, 523, 460, 397, 334, 271, 208, + 145, 82, 19, 1280, 1217, 1154, 1091, 1028, 965, 902, 839, 776, 713, + 650, 587, 524, 461, 398, 335, 272, 209, 146, 83, 20, 1344, 1281, 
+1218, 1155, 1092, 1029, 966, 903, 840, 777, 714, 651, 588, 525, 462, + 399, 336, 273, 210, 147, 84, 21, 1408, 1345, 1282, 1219, 1156, 1093, +1030, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, 274, + 211, 148, 85, 22, 1472, 1409, 1346, 1283, 1220, 1157, 1094, 1031, 968, + 905, 842, 779, 716, 653, 590, 527, 464, 401, 338, 275, 212, 149, + 86, 23, 1536, 1473, 1410, 1347, 1284, 1221, 1158, 1095, 1032, 969, 906, + 843, 780, 717, 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, + 24, 1600, 1537, 1474, 1411, 1348, 1285, 1222, 1159, 1096, 1033, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 1664, 1601, 1538, 1475, 1412, 1349, 1286, 1223, 1160, 1097, 1034, 971, + 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, 215, 152, + 89, 26, 1728, 1665, 1602, 1539, 1476, 1413, 1350, 1287, 1224, 1161, 1098, +1035, 972, 909, 846, 783, 720, 657, 594, 531, 468, 405, 342, 279, + 216, 153, 90, 27, 1792, 1729, 1666, 1603, 1540, 1477, 1414, 1351, 1288, +1225, 1162, 1099, 1036, 973, 910, 847, 784, 721, 658, 595, 532, 469, + 406, 343, 280, 217, 154, 91, 28, 1856, 1793, 1730, 1667, 1604, 1541, +1478, 1415, 1352, 1289, 1226, 1163, 1100, 1037, 974, 911, 848, 785, 722, + 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, 1920, 1857, +1794, 1731, 1668, 1605, 1542, 1479, 1416, 1353, 1290, 1227, 1164, 1101, 1038, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 1984, 1921, 1858, 1795, 1732, 1669, 1606, 1543, 1480, 1417, +1354, 1291, 1228, 1165, 1102, 1039, 976, 913, 850, 787, 724, 661, 598, + 535, 472, 409, 346, 283, 220, 157, 94, 31, 1985, 1922, 1859, 1796, +1733, 1670, 1607, 1544, 1481, 1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, + 914, 851, 788, 725, 662, 599, 536, 473, 410, 347, 284, 221, 158, + 95, 32, 1986, 1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, +1293, 1230, 1167, 1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, + 474, 411, 348, 285, 222, 159, 96, 33, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, 1484, 1421, 1358, 1295, +1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, 665, 602, 539, 476, + 413, 350, 287, 224, 161, 98, 35, 1989, 1926, 1863, 1800, 1737, 1674, +1611, 1548, 1485, 1422, 1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, + 792, 729, 666, 603, 540, 477, 414, 351, 288, 225, 162, 99, 36, +1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, 1297, 1234, +1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, 478, 415, + 352, 289, 226, 163, 100, 37, 1991, 1928, 1865, 1802, 1739, 1676, 1613, +1550, 1487, 1424, 1361, 1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, + 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, 38, 1992, +1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, 1362, 1299, 1236, 1173, +1110, 1047, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, + 291, 228, 165, 102, 39, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 1994, 1931, +1868, 1805, 1742, 1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, +1049, 986, 923, 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, + 230, 167, 104, 41, 1995, 1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, +1428, 1365, 1302, 1239, 1176, 1113, 1050, 987, 924, 861, 798, 735, 672, + 609, 546, 483, 420, 357, 294, 
231, 168, 105, 42, 1996, 1933, 1870, +1807, 1744, 1681, 1618, 1555, 1492, 1429, 1366, 1303, 1240, 1177, 1114, 1051, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 1997, 1934, 1871, 1808, 1745, 1682, 1619, 1556, 1493, 1430, +1367, 1304, 1241, 1178, 1115, 1052, 989, 926, 863, 800, 737, 674, 611, + 548, 485, 422, 359, 296, 233, 170, 107, 44, 1998, 1935, 1872, 1809, +1746, 1683, 1620, 1557, 1494, 1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, + 927, 864, 801, 738, 675, 612, 549, 486, 423, 360, 297, 234, 171, + 108, 45, 1999, 1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, +1306, 1243, 1180, 1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, + 487, 424, 361, 298, 235, 172, 109, 46, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, 110, + 47, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, 1497, 1434, 1371, 1308, +1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, 678, 615, 552, 489, + 426, 363, 300, 237, 174, 111, 48, 2002, 1939, 1876, 1813, 1750, 1687, +1624, 1561, 1498, 1435, 1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, + 805, 742, 679, 616, 553, 490, 427, 364, 301, 238, 175, 112, 49, +2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, 1310, 1247, +1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, 491, 428, + 365, 302, 239, 176, 113, 50, 2004, 1941, 1878, 1815, 1752, 1689, 1626, +1563, 1500, 1437, 1374, 1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, + 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, 51, 2005, +1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, 1375, 1312, 1249, 1186, +1123, 1060, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, + 304, 241, 178, 115, 52, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 2007, 1944, +1881, 1818, 1755, 1692, 1629, 1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, +1062, 999, 936, 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, + 243, 180, 117, 54, 2008, 1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, +1441, 1378, 1315, 1252, 1189, 1126, 1063, 1000, 937, 874, 811, 748, 685, + 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, 2009, 1946, 1883, +1820, 1757, 1694, 1631, 1568, 1505, 1442, 1379, 1316, 1253, 1190, 1127, 1064, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 2010, 1947, 1884, 1821, 1758, 1695, 1632, 1569, 1506, 1443, +1380, 1317, 1254, 1191, 1128, 1065, 1002, 939, 876, 813, 750, 687, 624, + 561, 498, 435, 372, 309, 246, 183, 120, 57, 2011, 1948, 1885, 1822, +1759, 1696, 1633, 1570, 1507, 1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, + 940, 877, 814, 751, 688, 625, 562, 499, 436, 373, 310, 247, 184, + 121, 58, 2012, 1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, +1319, 1256, 1193, 1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, + 500, 437, 374, 311, 248, 185, 122, 59, 2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, 1510, 1447, 1384, 1321, +1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, 691, 628, 565, 502, + 439, 376, 313, 250, 187, 124, 61, 2015, 1952, 1889, 1826, 1763, 1700, +1637, 1574, 1511, 1448, 1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, + 818, 755, 692, 
629, 566, 503, 440, 377, 314, 251, 188, 125, 62, +2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, 1323, 1260, +1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, 504, 441, + 378, 315, 252, 189, 126, 63, 2017, 1954, 1891, 1828, 1765, 1702, 1639, +1576, 1513, 1450, 1387, 1324, 1261, 1198, 1135, 1072, 1009, 946, 883, 820, + 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, 2018, 1955, +1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, 1136, +1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, + 254, 191, 2019, 1956, 1893, 1830, 1767, 1704, 1641, 1578, 1515, 1452, 1389, +1326, 1263, 1200, 1137, 1074, 1011, 948, 885, 822, 759, 696, 633, 570, + 507, 444, 381, 318, 255, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 2021, 1958, 1895, 1832, 1769, 1706, +1643, 1580, 1517, 1454, 1391, 1328, 1265, 1202, 1139, 1076, 1013, 950, 887, + 824, 761, 698, 635, 572, 509, 446, 383, 2022, 1959, 1896, 1833, 1770, +1707, 1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 2023, 1960, 1897, 1834, 1771, +1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, 1204, 1141, 1078, 1015, 952, + 889, 826, 763, 700, 637, 574, 511, 2024, 1961, 1898, 1835, 1772, 1709, +1646, 1583, 1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, + 827, 764, 701, 638, 575, 2025, 1962, 1899, 1836, 1773, 1710, 1647, 1584, +1521, 1458, 1395, 1332, 1269, 1206, 1143, 1080, 1017, 954, 891, 828, 765, + 702, 639, 2026, 1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, +1333, 1270, 1207, 1144, 1081, 1018, 955, 892, 829, 766, 703, 2027, 1964, +1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, 1271, 1208, 1145, +1082, 1019, 956, 893, 830, 767, 2028, 1965, 1902, 1839, 1776, 1713, 1650, +1587, 1524, 1461, 1398, 1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, +2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, 1336, 1273, +1210, 1147, 1084, 1021, 958, 895, 2030, 1967, 1904, 1841, 1778, 1715, 1652, +1589, 1526, 1463, 1400, 1337, 1274, 1211, 1148, 1085, 1022, 959, 2031, 1968, +1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, 1149, +1086, 1023, 2032, 1969, 1906, 1843, 1780, 1717, 1654, 1591, 1528, 1465, 1402, +1339, 1276, 1213, 1150, 1087, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 2034, 1971, 1908, 1845, 1782, 1719, +1656, 1593, 1530, 1467, 1404, 1341, 1278, 1215, 2035, 1972, 1909, 1846, 1783, +1720, 1657, 1594, 1531, 1468, 1405, 1342, 1279, 2036, 1973, 1910, 1847, 1784, +1721, 1658, 1595, 1532, 1469, 1406, 1343, 2037, 1974, 1911, 1848, 1785, 1722, +1659, 1596, 1533, 1470, 1407, 2038, 1975, 1912, 1849, 1786, 1723, 1660, 1597, +1534, 1471, 2039, 1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 2040, 1977, +1914, 1851, 1788, 1725, 1662, 1599, 2041, 1978, 1915, 1852, 1789, 1726, 1663, +2042, 1979, 1916, 1853, 1790, 1727, 2043, 1980, 1917, 1854, 1791, 2044, 1981, +1918, 1855, 2045, 1982, 1919, 2046, 1983, 2047, 0, 64, 1, 128, 65, // 64x64 + 2, 192, 129, 66, 3, 256, 193, 130, 67, 4, 320, 257, 194, + 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, + 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, + 8, 576, 513, 450, 387, 324, 261, 198, 135, 72, 9, 640, 577, + 514, 451, 388, 325, 262, 199, 136, 73, 10, 704, 641, 578, 515, + 452, 389, 326, 263, 200, 137, 74, 11, 768, 705, 642, 579, 516, + 
453, 390, 327, 264, 201, 138, 75, 12, 832, 769, 706, 643, 580, + 517, 454, 391, 328, 265, 202, 139, 76, 13, 896, 833, 770, 707, + 644, 581, 518, 455, 392, 329, 266, 203, 140, 77, 14, 960, 897, + 834, 771, 708, 645, 582, 519, 456, 393, 330, 267, 204, 141, 78, + 15, 1024, 961, 898, 835, 772, 709, 646, 583, 520, 457, 394, 331, + 268, 205, 142, 79, 16, 1088, 1025, 962, 899, 836, 773, 710, 647, + 584, 521, 458, 395, 332, 269, 206, 143, 80, 17, 1152, 1089, 1026, + 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, 333, 270, 207, + 144, 81, 18, 1216, 1153, 1090, 1027, 964, 901, 838, 775, 712, 649, + 586, 523, 460, 397, 334, 271, 208, 145, 82, 19, 1280, 1217, 1154, +1091, 1028, 965, 902, 839, 776, 713, 650, 587, 524, 461, 398, 335, + 272, 209, 146, 83, 20, 1344, 1281, 1218, 1155, 1092, 1029, 966, 903, + 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, 84, + 21, 1408, 1345, 1282, 1219, 1156, 1093, 1030, 967, 904, 841, 778, 715, + 652, 589, 526, 463, 400, 337, 274, 211, 148, 85, 22, 1472, 1409, +1346, 1283, 1220, 1157, 1094, 1031, 968, 905, 842, 779, 716, 653, 590, + 527, 464, 401, 338, 275, 212, 149, 86, 23, 1536, 1473, 1410, 1347, +1284, 1221, 1158, 1095, 1032, 969, 906, 843, 780, 717, 654, 591, 528, + 465, 402, 339, 276, 213, 150, 87, 24, 1600, 1537, 1474, 1411, 1348, +1285, 1222, 1159, 1096, 1033, 970, 907, 844, 781, 718, 655, 592, 529, + 466, 403, 340, 277, 214, 151, 88, 25, 1664, 1601, 1538, 1475, 1412, +1349, 1286, 1223, 1160, 1097, 1034, 971, 908, 845, 782, 719, 656, 593, + 530, 467, 404, 341, 278, 215, 152, 89, 26, 1728, 1665, 1602, 1539, +1476, 1413, 1350, 1287, 1224, 1161, 1098, 1035, 972, 909, 846, 783, 720, + 657, 594, 531, 468, 405, 342, 279, 216, 153, 90, 27, 1792, 1729, +1666, 1603, 1540, 1477, 1414, 1351, 1288, 1225, 1162, 1099, 1036, 973, 910, + 847, 784, 721, 658, 595, 532, 469, 406, 343, 280, 217, 154, 91, + 28, 1856, 1793, 1730, 1667, 1604, 1541, 1478, 1415, 1352, 1289, 1226, 1163, +1100, 1037, 974, 911, 848, 785, 722, 659, 596, 533, 470, 407, 344, + 281, 218, 155, 92, 29, 1920, 1857, 1794, 1731, 1668, 1605, 1542, 1479, +1416, 1353, 1290, 1227, 1164, 1101, 1038, 975, 912, 849, 786, 723, 660, + 597, 534, 471, 408, 345, 282, 219, 156, 93, 30, 1984, 1921, 1858, +1795, 1732, 1669, 1606, 1543, 1480, 1417, 1354, 1291, 1228, 1165, 1102, 1039, + 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, 346, 283, 220, + 157, 94, 31, 2048, 1985, 1922, 1859, 1796, 1733, 1670, 1607, 1544, 1481, +1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, 914, 851, 788, 725, 662, + 599, 536, 473, 410, 347, 284, 221, 158, 95, 32, 2112, 2049, 1986, +1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, 1293, 1230, 1167, +1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, 474, 411, 348, + 285, 222, 159, 96, 33, 2176, 2113, 2050, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 2240, 2177, 2114, 2051, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, +1484, 1421, 1358, 1295, 1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, + 665, 602, 539, 476, 413, 350, 287, 224, 161, 98, 35, 2304, 2241, +2178, 2115, 2052, 1989, 1926, 1863, 1800, 1737, 1674, 1611, 1548, 1485, 1422, +1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, 792, 729, 666, 603, + 540, 477, 414, 351, 288, 225, 162, 99, 36, 2368, 2305, 2242, 2179, +2116, 2053, 1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, +1297, 1234, 1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, + 478, 415, 352, 289, 
226, 163, 100, 37, 2432, 2369, 2306, 2243, 2180, +2117, 2054, 1991, 1928, 1865, 1802, 1739, 1676, 1613, 1550, 1487, 1424, 1361, +1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, 731, 668, 605, 542, + 479, 416, 353, 290, 227, 164, 101, 38, 2496, 2433, 2370, 2307, 2244, +2181, 2118, 2055, 1992, 1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, +1362, 1299, 1236, 1173, 1110, 1047, 984, 921, 858, 795, 732, 669, 606, + 543, 480, 417, 354, 291, 228, 165, 102, 39, 2560, 2497, 2434, 2371, +2308, 2245, 2182, 2119, 2056, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 2624, 2561, +2498, 2435, 2372, 2309, 2246, 2183, 2120, 2057, 1994, 1931, 1868, 1805, 1742, +1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, 1049, 986, 923, + 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, 230, 167, 104, + 41, 2688, 2625, 2562, 2499, 2436, 2373, 2310, 2247, 2184, 2121, 2058, 1995, +1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, 1428, 1365, 1302, 1239, 1176, +1113, 1050, 987, 924, 861, 798, 735, 672, 609, 546, 483, 420, 357, + 294, 231, 168, 105, 42, 2752, 2689, 2626, 2563, 2500, 2437, 2374, 2311, +2248, 2185, 2122, 2059, 1996, 1933, 1870, 1807, 1744, 1681, 1618, 1555, 1492, +1429, 1366, 1303, 1240, 1177, 1114, 1051, 988, 925, 862, 799, 736, 673, + 610, 547, 484, 421, 358, 295, 232, 169, 106, 43, 2816, 2753, 2690, +2627, 2564, 2501, 2438, 2375, 2312, 2249, 2186, 2123, 2060, 1997, 1934, 1871, +1808, 1745, 1682, 1619, 1556, 1493, 1430, 1367, 1304, 1241, 1178, 1115, 1052, + 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, 359, 296, 233, + 170, 107, 44, 2880, 2817, 2754, 2691, 2628, 2565, 2502, 2439, 2376, 2313, +2250, 2187, 2124, 2061, 1998, 1935, 1872, 1809, 1746, 1683, 1620, 1557, 1494, +1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, 927, 864, 801, 738, 675, + 612, 549, 486, 423, 360, 297, 234, 171, 108, 45, 2944, 2881, 2818, +2755, 2692, 2629, 2566, 2503, 2440, 2377, 2314, 2251, 2188, 2125, 2062, 1999, +1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, 1306, 1243, 1180, +1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, 487, 424, 361, + 298, 235, 172, 109, 46, 3008, 2945, 2882, 2819, 2756, 2693, 2630, 2567, +2504, 2441, 2378, 2315, 2252, 2189, 2126, 2063, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, 110, + 47, 3072, 3009, 2946, 2883, 2820, 2757, 2694, 2631, 2568, 2505, 2442, 2379, +2316, 2253, 2190, 2127, 2064, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, +1497, 1434, 1371, 1308, 1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, + 678, 615, 552, 489, 426, 363, 300, 237, 174, 111, 48, 3136, 3073, +3010, 2947, 2884, 2821, 2758, 2695, 2632, 2569, 2506, 2443, 2380, 2317, 2254, +2191, 2128, 2065, 2002, 1939, 1876, 1813, 1750, 1687, 1624, 1561, 1498, 1435, +1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, 805, 742, 679, 616, + 553, 490, 427, 364, 301, 238, 175, 112, 49, 3200, 3137, 3074, 3011, +2948, 2885, 2822, 2759, 2696, 2633, 2570, 2507, 2444, 2381, 2318, 2255, 2192, +2129, 2066, 2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, +1310, 1247, 1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, + 491, 428, 365, 302, 239, 176, 113, 50, 3264, 3201, 3138, 3075, 3012, +2949, 2886, 2823, 2760, 2697, 2634, 2571, 2508, 2445, 2382, 2319, 2256, 2193, +2130, 2067, 2004, 1941, 1878, 1815, 1752, 1689, 1626, 1563, 1500, 1437, 
1374, +1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, 744, 681, 618, 555, + 492, 429, 366, 303, 240, 177, 114, 51, 3328, 3265, 3202, 3139, 3076, +3013, 2950, 2887, 2824, 2761, 2698, 2635, 2572, 2509, 2446, 2383, 2320, 2257, +2194, 2131, 2068, 2005, 1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, +1375, 1312, 1249, 1186, 1123, 1060, 997, 934, 871, 808, 745, 682, 619, + 556, 493, 430, 367, 304, 241, 178, 115, 52, 3392, 3329, 3266, 3203, +3140, 3077, 3014, 2951, 2888, 2825, 2762, 2699, 2636, 2573, 2510, 2447, 2384, +2321, 2258, 2195, 2132, 2069, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 3456, 3393, +3330, 3267, 3204, 3141, 3078, 3015, 2952, 2889, 2826, 2763, 2700, 2637, 2574, +2511, 2448, 2385, 2322, 2259, 2196, 2133, 2070, 2007, 1944, 1881, 1818, 1755, +1692, 1629, 1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, 1062, 999, 936, + 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, 243, 180, 117, + 54, 3520, 3457, 3394, 3331, 3268, 3205, 3142, 3079, 3016, 2953, 2890, 2827, +2764, 2701, 2638, 2575, 2512, 2449, 2386, 2323, 2260, 2197, 2134, 2071, 2008, +1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, 1441, 1378, 1315, 1252, 1189, +1126, 1063, 1000, 937, 874, 811, 748, 685, 622, 559, 496, 433, 370, + 307, 244, 181, 118, 55, 3584, 3521, 3458, 3395, 3332, 3269, 3206, 3143, +3080, 3017, 2954, 2891, 2828, 2765, 2702, 2639, 2576, 2513, 2450, 2387, 2324, +2261, 2198, 2135, 2072, 2009, 1946, 1883, 1820, 1757, 1694, 1631, 1568, 1505, +1442, 1379, 1316, 1253, 1190, 1127, 1064, 1001, 938, 875, 812, 749, 686, + 623, 560, 497, 434, 371, 308, 245, 182, 119, 56, 3648, 3585, 3522, +3459, 3396, 3333, 3270, 3207, 3144, 3081, 3018, 2955, 2892, 2829, 2766, 2703, +2640, 2577, 2514, 2451, 2388, 2325, 2262, 2199, 2136, 2073, 2010, 1947, 1884, +1821, 1758, 1695, 1632, 1569, 1506, 1443, 1380, 1317, 1254, 1191, 1128, 1065, +1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, 372, 309, 246, + 183, 120, 57, 3712, 3649, 3586, 3523, 3460, 3397, 3334, 3271, 3208, 3145, +3082, 3019, 2956, 2893, 2830, 2767, 2704, 2641, 2578, 2515, 2452, 2389, 2326, +2263, 2200, 2137, 2074, 2011, 1948, 1885, 1822, 1759, 1696, 1633, 1570, 1507, +1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, 940, 877, 814, 751, 688, + 625, 562, 499, 436, 373, 310, 247, 184, 121, 58, 3776, 3713, 3650, +3587, 3524, 3461, 3398, 3335, 3272, 3209, 3146, 3083, 3020, 2957, 2894, 2831, +2768, 2705, 2642, 2579, 2516, 2453, 2390, 2327, 2264, 2201, 2138, 2075, 2012, +1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, 1319, 1256, 1193, +1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, 500, 437, 374, + 311, 248, 185, 122, 59, 3840, 3777, 3714, 3651, 3588, 3525, 3462, 3399, +3336, 3273, 3210, 3147, 3084, 3021, 2958, 2895, 2832, 2769, 2706, 2643, 2580, +2517, 2454, 2391, 2328, 2265, 2202, 2139, 2076, 2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 3904, 3841, 3778, 3715, 3652, 3589, 3526, 3463, 3400, 3337, 3274, 3211, +3148, 3085, 3022, 2959, 2896, 2833, 2770, 2707, 2644, 2581, 2518, 2455, 2392, +2329, 2266, 2203, 2140, 2077, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, +1510, 1447, 1384, 1321, 1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, + 691, 628, 565, 502, 439, 376, 313, 250, 187, 124, 61, 3968, 3905, +3842, 3779, 3716, 3653, 3590, 3527, 3464, 3401, 3338, 3275, 3212, 3149, 3086, 
+3023, 2960, 2897, 2834, 2771, 2708, 2645, 2582, 2519, 2456, 2393, 2330, 2267, +2204, 2141, 2078, 2015, 1952, 1889, 1826, 1763, 1700, 1637, 1574, 1511, 1448, +1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, 818, 755, 692, 629, + 566, 503, 440, 377, 314, 251, 188, 125, 62, 4032, 3969, 3906, 3843, +3780, 3717, 3654, 3591, 3528, 3465, 3402, 3339, 3276, 3213, 3150, 3087, 3024, +2961, 2898, 2835, 2772, 2709, 2646, 2583, 2520, 2457, 2394, 2331, 2268, 2205, +2142, 2079, 2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, +1323, 1260, 1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, + 504, 441, 378, 315, 252, 189, 126, 63, 4033, 3970, 3907, 3844, 3781, +3718, 3655, 3592, 3529, 3466, 3403, 3340, 3277, 3214, 3151, 3088, 3025, 2962, +2899, 2836, 2773, 2710, 2647, 2584, 2521, 2458, 2395, 2332, 2269, 2206, 2143, +2080, 2017, 1954, 1891, 1828, 1765, 1702, 1639, 1576, 1513, 1450, 1387, 1324, +1261, 1198, 1135, 1072, 1009, 946, 883, 820, 757, 694, 631, 568, 505, + 442, 379, 316, 253, 190, 127, 4034, 3971, 3908, 3845, 3782, 3719, 3656, +3593, 3530, 3467, 3404, 3341, 3278, 3215, 3152, 3089, 3026, 2963, 2900, 2837, +2774, 2711, 2648, 2585, 2522, 2459, 2396, 2333, 2270, 2207, 2144, 2081, 2018, +1955, 1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, +1136, 1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, + 317, 254, 191, 4035, 3972, 3909, 3846, 3783, 3720, 3657, 3594, 3531, 3468, +3405, 3342, 3279, 3216, 3153, 3090, 3027, 2964, 2901, 2838, 2775, 2712, 2649, +2586, 2523, 2460, 2397, 2334, 2271, 2208, 2145, 2082, 2019, 1956, 1893, 1830, +1767, 1704, 1641, 1578, 1515, 1452, 1389, 1326, 1263, 1200, 1137, 1074, 1011, + 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, 255, 4036, +3973, 3910, 3847, 3784, 3721, 3658, 3595, 3532, 3469, 3406, 3343, 3280, 3217, +3154, 3091, 3028, 2965, 2902, 2839, 2776, 2713, 2650, 2587, 2524, 2461, 2398, +2335, 2272, 2209, 2146, 2083, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 4037, 3974, 3911, 3848, 3785, 3722, +3659, 3596, 3533, 3470, 3407, 3344, 3281, 3218, 3155, 3092, 3029, 2966, 2903, +2840, 2777, 2714, 2651, 2588, 2525, 2462, 2399, 2336, 2273, 2210, 2147, 2084, +2021, 1958, 1895, 1832, 1769, 1706, 1643, 1580, 1517, 1454, 1391, 1328, 1265, +1202, 1139, 1076, 1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, + 383, 4038, 3975, 3912, 3849, 3786, 3723, 3660, 3597, 3534, 3471, 3408, 3345, +3282, 3219, 3156, 3093, 3030, 2967, 2904, 2841, 2778, 2715, 2652, 2589, 2526, +2463, 2400, 2337, 2274, 2211, 2148, 2085, 2022, 1959, 1896, 1833, 1770, 1707, +1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, 888, + 825, 762, 699, 636, 573, 510, 447, 4039, 3976, 3913, 3850, 3787, 3724, +3661, 3598, 3535, 3472, 3409, 3346, 3283, 3220, 3157, 3094, 3031, 2968, 2905, +2842, 2779, 2716, 2653, 2590, 2527, 2464, 2401, 2338, 2275, 2212, 2149, 2086, +2023, 1960, 1897, 1834, 1771, 1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, +1204, 1141, 1078, 1015, 952, 889, 826, 763, 700, 637, 574, 511, 4040, +3977, 3914, 3851, 3788, 3725, 3662, 3599, 3536, 3473, 3410, 3347, 3284, 3221, +3158, 3095, 3032, 2969, 2906, 2843, 2780, 2717, 2654, 2591, 2528, 2465, 2402, +2339, 2276, 2213, 2150, 2087, 2024, 1961, 1898, 1835, 1772, 1709, 1646, 1583, +1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, 827, 764, + 701, 638, 575, 4041, 3978, 3915, 3852, 3789, 3726, 3663, 3600, 3537, 3474, +3411, 3348, 3285, 3222, 
3159, 3096, 3033, 2970, 2907, 2844, 2781, 2718, 2655, +2592, 2529, 2466, 2403, 2340, 2277, 2214, 2151, 2088, 2025, 1962, 1899, 1836, +1773, 1710, 1647, 1584, 1521, 1458, 1395, 1332, 1269, 1206, 1143, 1080, 1017, + 954, 891, 828, 765, 702, 639, 4042, 3979, 3916, 3853, 3790, 3727, 3664, +3601, 3538, 3475, 3412, 3349, 3286, 3223, 3160, 3097, 3034, 2971, 2908, 2845, +2782, 2719, 2656, 2593, 2530, 2467, 2404, 2341, 2278, 2215, 2152, 2089, 2026, +1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, 1333, 1270, 1207, +1144, 1081, 1018, 955, 892, 829, 766, 703, 4043, 3980, 3917, 3854, 3791, +3728, 3665, 3602, 3539, 3476, 3413, 3350, 3287, 3224, 3161, 3098, 3035, 2972, +2909, 2846, 2783, 2720, 2657, 2594, 2531, 2468, 2405, 2342, 2279, 2216, 2153, +2090, 2027, 1964, 1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, +1271, 1208, 1145, 1082, 1019, 956, 893, 830, 767, 4044, 3981, 3918, 3855, +3792, 3729, 3666, 3603, 3540, 3477, 3414, 3351, 3288, 3225, 3162, 3099, 3036, +2973, 2910, 2847, 2784, 2721, 2658, 2595, 2532, 2469, 2406, 2343, 2280, 2217, +2154, 2091, 2028, 1965, 1902, 1839, 1776, 1713, 1650, 1587, 1524, 1461, 1398, +1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, 4045, 3982, 3919, 3856, +3793, 3730, 3667, 3604, 3541, 3478, 3415, 3352, 3289, 3226, 3163, 3100, 3037, +2974, 2911, 2848, 2785, 2722, 2659, 2596, 2533, 2470, 2407, 2344, 2281, 2218, +2155, 2092, 2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, +1336, 1273, 1210, 1147, 1084, 1021, 958, 895, 4046, 3983, 3920, 3857, 3794, +3731, 3668, 3605, 3542, 3479, 3416, 3353, 3290, 3227, 3164, 3101, 3038, 2975, +2912, 2849, 2786, 2723, 2660, 2597, 2534, 2471, 2408, 2345, 2282, 2219, 2156, +2093, 2030, 1967, 1904, 1841, 1778, 1715, 1652, 1589, 1526, 1463, 1400, 1337, +1274, 1211, 1148, 1085, 1022, 959, 4047, 3984, 3921, 3858, 3795, 3732, 3669, +3606, 3543, 3480, 3417, 3354, 3291, 3228, 3165, 3102, 3039, 2976, 2913, 2850, +2787, 2724, 2661, 2598, 2535, 2472, 2409, 2346, 2283, 2220, 2157, 2094, 2031, +1968, 1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, +1149, 1086, 1023, 4048, 3985, 3922, 3859, 3796, 3733, 3670, 3607, 3544, 3481, +3418, 3355, 3292, 3229, 3166, 3103, 3040, 2977, 2914, 2851, 2788, 2725, 2662, +2599, 2536, 2473, 2410, 2347, 2284, 2221, 2158, 2095, 2032, 1969, 1906, 1843, +1780, 1717, 1654, 1591, 1528, 1465, 1402, 1339, 1276, 1213, 1150, 1087, 4049, +3986, 3923, 3860, 3797, 3734, 3671, 3608, 3545, 3482, 3419, 3356, 3293, 3230, +3167, 3104, 3041, 2978, 2915, 2852, 2789, 2726, 2663, 2600, 2537, 2474, 2411, +2348, 2285, 2222, 2159, 2096, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 4050, 3987, 3924, 3861, 3798, 3735, +3672, 3609, 3546, 3483, 3420, 3357, 3294, 3231, 3168, 3105, 3042, 2979, 2916, +2853, 2790, 2727, 2664, 2601, 2538, 2475, 2412, 2349, 2286, 2223, 2160, 2097, +2034, 1971, 1908, 1845, 1782, 1719, 1656, 1593, 1530, 1467, 1404, 1341, 1278, +1215, 4051, 3988, 3925, 3862, 3799, 3736, 3673, 3610, 3547, 3484, 3421, 3358, +3295, 3232, 3169, 3106, 3043, 2980, 2917, 2854, 2791, 2728, 2665, 2602, 2539, +2476, 2413, 2350, 2287, 2224, 2161, 2098, 2035, 1972, 1909, 1846, 1783, 1720, +1657, 1594, 1531, 1468, 1405, 1342, 1279, 4052, 3989, 3926, 3863, 3800, 3737, +3674, 3611, 3548, 3485, 3422, 3359, 3296, 3233, 3170, 3107, 3044, 2981, 2918, +2855, 2792, 2729, 2666, 2603, 2540, 2477, 2414, 2351, 2288, 2225, 2162, 2099, +2036, 1973, 1910, 1847, 1784, 1721, 1658, 1595, 1532, 1469, 1406, 1343, 4053, +3990, 3927, 3864, 3801, 3738, 3675, 3612, 
3549, 3486, 3423, 3360, 3297, 3234, +3171, 3108, 3045, 2982, 2919, 2856, 2793, 2730, 2667, 2604, 2541, 2478, 2415, +2352, 2289, 2226, 2163, 2100, 2037, 1974, 1911, 1848, 1785, 1722, 1659, 1596, +1533, 1470, 1407, 4054, 3991, 3928, 3865, 3802, 3739, 3676, 3613, 3550, 3487, +3424, 3361, 3298, 3235, 3172, 3109, 3046, 2983, 2920, 2857, 2794, 2731, 2668, +2605, 2542, 2479, 2416, 2353, 2290, 2227, 2164, 2101, 2038, 1975, 1912, 1849, +1786, 1723, 1660, 1597, 1534, 1471, 4055, 3992, 3929, 3866, 3803, 3740, 3677, +3614, 3551, 3488, 3425, 3362, 3299, 3236, 3173, 3110, 3047, 2984, 2921, 2858, +2795, 2732, 2669, 2606, 2543, 2480, 2417, 2354, 2291, 2228, 2165, 2102, 2039, +1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 4056, 3993, 3930, 3867, 3804, +3741, 3678, 3615, 3552, 3489, 3426, 3363, 3300, 3237, 3174, 3111, 3048, 2985, +2922, 2859, 2796, 2733, 2670, 2607, 2544, 2481, 2418, 2355, 2292, 2229, 2166, +2103, 2040, 1977, 1914, 1851, 1788, 1725, 1662, 1599, 4057, 3994, 3931, 3868, +3805, 3742, 3679, 3616, 3553, 3490, 3427, 3364, 3301, 3238, 3175, 3112, 3049, +2986, 2923, 2860, 2797, 2734, 2671, 2608, 2545, 2482, 2419, 2356, 2293, 2230, +2167, 2104, 2041, 1978, 1915, 1852, 1789, 1726, 1663, 4058, 3995, 3932, 3869, +3806, 3743, 3680, 3617, 3554, 3491, 3428, 3365, 3302, 3239, 3176, 3113, 3050, +2987, 2924, 2861, 2798, 2735, 2672, 2609, 2546, 2483, 2420, 2357, 2294, 2231, +2168, 2105, 2042, 1979, 1916, 1853, 1790, 1727, 4059, 3996, 3933, 3870, 3807, +3744, 3681, 3618, 3555, 3492, 3429, 3366, 3303, 3240, 3177, 3114, 3051, 2988, +2925, 2862, 2799, 2736, 2673, 2610, 2547, 2484, 2421, 2358, 2295, 2232, 2169, +2106, 2043, 1980, 1917, 1854, 1791, 4060, 3997, 3934, 3871, 3808, 3745, 3682, +3619, 3556, 3493, 3430, 3367, 3304, 3241, 3178, 3115, 3052, 2989, 2926, 2863, +2800, 2737, 2674, 2611, 2548, 2485, 2422, 2359, 2296, 2233, 2170, 2107, 2044, +1981, 1918, 1855, 4061, 3998, 3935, 3872, 3809, 3746, 3683, 3620, 3557, 3494, +3431, 3368, 3305, 3242, 3179, 3116, 3053, 2990, 2927, 2864, 2801, 2738, 2675, +2612, 2549, 2486, 2423, 2360, 2297, 2234, 2171, 2108, 2045, 1982, 1919, 4062, +3999, 3936, 3873, 3810, 3747, 3684, 3621, 3558, 3495, 3432, 3369, 3306, 3243, +3180, 3117, 3054, 2991, 2928, 2865, 2802, 2739, 2676, 2613, 2550, 2487, 2424, +2361, 2298, 2235, 2172, 2109, 2046, 1983, 4063, 4000, 3937, 3874, 3811, 3748, +3685, 3622, 3559, 3496, 3433, 3370, 3307, 3244, 3181, 3118, 3055, 2992, 2929, +2866, 2803, 2740, 2677, 2614, 2551, 2488, 2425, 2362, 2299, 2236, 2173, 2110, +2047, 4064, 4001, 3938, 3875, 3812, 3749, 3686, 3623, 3560, 3497, 3434, 3371, +3308, 3245, 3182, 3119, 3056, 2993, 2930, 2867, 2804, 2741, 2678, 2615, 2552, +2489, 2426, 2363, 2300, 2237, 2174, 2111, 4065, 4002, 3939, 3876, 3813, 3750, +3687, 3624, 3561, 3498, 3435, 3372, 3309, 3246, 3183, 3120, 3057, 2994, 2931, +2868, 2805, 2742, 2679, 2616, 2553, 2490, 2427, 2364, 2301, 2238, 2175, 4066, +4003, 3940, 3877, 3814, 3751, 3688, 3625, 3562, 3499, 3436, 3373, 3310, 3247, +3184, 3121, 3058, 2995, 2932, 2869, 2806, 2743, 2680, 2617, 2554, 2491, 2428, +2365, 2302, 2239, 4067, 4004, 3941, 3878, 3815, 3752, 3689, 3626, 3563, 3500, +3437, 3374, 3311, 3248, 3185, 3122, 3059, 2996, 2933, 2870, 2807, 2744, 2681, +2618, 2555, 2492, 2429, 2366, 2303, 4068, 4005, 3942, 3879, 3816, 3753, 3690, +3627, 3564, 3501, 3438, 3375, 3312, 3249, 3186, 3123, 3060, 2997, 2934, 2871, +2808, 2745, 2682, 2619, 2556, 2493, 2430, 2367, 4069, 4006, 3943, 3880, 3817, +3754, 3691, 3628, 3565, 3502, 3439, 3376, 3313, 3250, 3187, 3124, 3061, 2998, +2935, 2872, 2809, 2746, 2683, 2620, 2557, 
2494, 2431, 4070, 4007, 3944, 3881, +3818, 3755, 3692, 3629, 3566, 3503, 3440, 3377, 3314, 3251, 3188, 3125, 3062, +2999, 2936, 2873, 2810, 2747, 2684, 2621, 2558, 2495, 4071, 4008, 3945, 3882, +3819, 3756, 3693, 3630, 3567, 3504, 3441, 3378, 3315, 3252, 3189, 3126, 3063, +3000, 2937, 2874, 2811, 2748, 2685, 2622, 2559, 4072, 4009, 3946, 3883, 3820, +3757, 3694, 3631, 3568, 3505, 3442, 3379, 3316, 3253, 3190, 3127, 3064, 3001, +2938, 2875, 2812, 2749, 2686, 2623, 4073, 4010, 3947, 3884, 3821, 3758, 3695, +3632, 3569, 3506, 3443, 3380, 3317, 3254, 3191, 3128, 3065, 3002, 2939, 2876, +2813, 2750, 2687, 4074, 4011, 3948, 3885, 3822, 3759, 3696, 3633, 3570, 3507, +3444, 3381, 3318, 3255, 3192, 3129, 3066, 3003, 2940, 2877, 2814, 2751, 4075, +4012, 3949, 3886, 3823, 3760, 3697, 3634, 3571, 3508, 3445, 3382, 3319, 3256, +3193, 3130, 3067, 3004, 2941, 2878, 2815, 4076, 4013, 3950, 3887, 3824, 3761, +3698, 3635, 3572, 3509, 3446, 3383, 3320, 3257, 3194, 3131, 3068, 3005, 2942, +2879, 4077, 4014, 3951, 3888, 3825, 3762, 3699, 3636, 3573, 3510, 3447, 3384, +3321, 3258, 3195, 3132, 3069, 3006, 2943, 4078, 4015, 3952, 3889, 3826, 3763, +3700, 3637, 3574, 3511, 3448, 3385, 3322, 3259, 3196, 3133, 3070, 3007, 4079, +4016, 3953, 3890, 3827, 3764, 3701, 3638, 3575, 3512, 3449, 3386, 3323, 3260, +3197, 3134, 3071, 4080, 4017, 3954, 3891, 3828, 3765, 3702, 3639, 3576, 3513, +3450, 3387, 3324, 3261, 3198, 3135, 4081, 4018, 3955, 3892, 3829, 3766, 3703, +3640, 3577, 3514, 3451, 3388, 3325, 3262, 3199, 4082, 4019, 3956, 3893, 3830, +3767, 3704, 3641, 3578, 3515, 3452, 3389, 3326, 3263, 4083, 4020, 3957, 3894, +3831, 3768, 3705, 3642, 3579, 3516, 3453, 3390, 3327, 4084, 4021, 3958, 3895, +3832, 3769, 3706, 3643, 3580, 3517, 3454, 3391, 4085, 4022, 3959, 3896, 3833, +3770, 3707, 3644, 3581, 3518, 3455, 4086, 4023, 3960, 3897, 3834, 3771, 3708, +3645, 3582, 3519, 4087, 4024, 3961, 3898, 3835, 3772, 3709, 3646, 3583, 4088, +4025, 3962, 3899, 3836, 3773, 3710, 3647, 4089, 4026, 3963, 3900, 3837, 3774, +3711, 4090, 4027, 3964, 3901, 3838, 3775, 4091, 4028, 3965, 3902, 3839, 4092, +4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, // 4x4 GROUPED 1xN, 1x2, 1x4 + 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, // 1x8, 1x16 + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 1x32 + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, // 1x64 + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, // 2xN, 2x2, 2x4 + 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, // 2x8 + 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, // 2x16 + 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, + 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, + 30, 29, 31, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, // 2x32 + 9, 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, + 24, 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, + 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, + 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, + 61, 63, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, // 2x64 + 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, + 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, 35, + 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, 50, + 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, 61, + 63, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 
127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, // 4xN, 4x2, 4x4 + 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, + 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, // 4x8 + 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, + 22, 19, 29, 26, 23, 30, 27, 31, 0, 4, 1, 8, 5, // 4x16 + 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, + 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, + 31, 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, + 39, 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, + 51, 61, 58, 55, 62, 59, 63, 0, 4, 1, 8, 5, 2, // 4x32 + 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, + 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, + 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, + 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, + 61, 58, 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, + 73, 70, 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, + 85, 82, 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, + 100, 97, 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, + 107, 111, 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, + 122, 119, 126, 123, 127, 0, 4, 1, 8, 5, 2, 12, 9, // 4x64 + 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, + 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, + 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, 46, 43, + 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, 61, 58, + 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, 73, 70, + 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, 85, 82, + 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, 100, 97, + 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, 107, 111, + 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, 122, 119, + 126, 123, 127, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, // 8xN, 8x2 + 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, // 8x4 + 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, + 19, 27, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, + 22, 15, 30, 23, 31, 0, 8, 1, 16, 9, 2, 24, 17, // 8x8 + 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, + 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, + 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, + 31, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 0, 8, 1, 16, 9, 2, 24, 17, 10, // 8x16 + 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, + 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, + 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, + 64, 72, 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, + 90, 83, 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, + 61, 54, 47, 62, 55, 63, 96, 104, 97, 112, 105, 98, 120, + 113, 106, 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, + 77, 70, 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 100, + 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, 
126, + 119, 127, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, // 8x32 + 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, + 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, + 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 64, 72, + 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, 90, 83, + 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 96, 104, 97, 112, 105, 98, 120, 113, 106, + 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, 77, 70, + 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 128, 136, 129, + 144, 137, 130, 152, 145, 138, 131, 153, 146, 139, 154, 147, 155, + 100, 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, + 126, 119, 127, 160, 168, 161, 176, 169, 162, 184, 177, 170, 163, + 185, 178, 171, 186, 179, 187, 132, 140, 133, 148, 141, 134, 156, + 149, 142, 135, 157, 150, 143, 158, 151, 159, 192, 200, 193, 208, + 201, 194, 216, 209, 202, 195, 217, 210, 203, 218, 211, 219, 164, + 172, 165, 180, 173, 166, 188, 181, 174, 167, 189, 182, 175, 190, + 183, 191, 224, 232, 225, 240, 233, 226, 248, 241, 234, 227, 249, + 242, 235, 250, 243, 251, 196, 204, 197, 212, 205, 198, 220, 213, + 206, 199, 221, 214, 207, 222, 215, 223, 228, 236, 229, 244, 237, + 230, 252, 245, 238, 231, 253, 246, 239, 254, 247, 255, 0, 8, // 8x64 + 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, + 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, + 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, + 7, 29, 22, 15, 30, 23, 31, 64, 72, 65, 80, 73, 66, + 88, 81, 74, 67, 89, 82, 75, 90, 83, 91, 36, 44, 37, + 52, 45, 38, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, + 96, 104, 97, 112, 105, 98, 120, 113, 106, 99, 121, 114, 107, + 122, 115, 123, 68, 76, 69, 84, 77, 70, 92, 85, 78, 71, + 93, 86, 79, 94, 87, 95, 128, 136, 129, 144, 137, 130, 152, + 145, 138, 131, 153, 146, 139, 154, 147, 155, 100, 108, 101, 116, + 109, 102, 124, 117, 110, 103, 125, 118, 111, 126, 119, 127, 160, + 168, 161, 176, 169, 162, 184, 177, 170, 163, 185, 178, 171, 186, + 179, 187, 132, 140, 133, 148, 141, 134, 156, 149, 142, 135, 157, + 150, 143, 158, 151, 159, 192, 200, 193, 208, 201, 194, 216, 209, + 202, 195, 217, 210, 203, 218, 211, 219, 164, 172, 165, 180, 173, + 166, 188, 181, 174, 167, 189, 182, 175, 190, 183, 191, 224, 232, + 225, 240, 233, 226, 248, 241, 234, 227, 249, 242, 235, 250, 243, + 251, 196, 204, 197, 212, 205, 198, 220, 213, 206, 199, 221, 214, + 207, 222, 215, 223, 228, 236, 229, 244, 237, 230, 252, 245, 238, + 231, 253, 246, 239, 254, 247, 255, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 
511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 16xN + 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, // 16x2 + 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, // 16x4 + 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, + 35, 51, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 8, 24, 9, 40, 25, 10, 56, 41, + 26, 11, 57, 42, 27, 58, 43, 59, 12, 28, 13, 44, 29, + 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 0, 16, // 16x8 + 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, + 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, + 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, + 7, 53, 38, 23, 54, 39, 55, 68, 84, 69, 100, 85, 70, + 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, + 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, + 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, 121, 106, 91, + 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, 45, 30, 15, + 61, 46, 31, 62, 47, 63, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 0, 16, 1, 32, // 16x16 + 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 64, + 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, + 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, 161, + 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, 85, + 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, + 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, + 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, 226, + 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, 150, + 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, 74, + 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, 13, + 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, + 196, 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, + 246, 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, + 185, 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 200, 216, 201, 232, + 217, 202, 248, 233, 218, 203, 249, 234, 219, 250, 235, 251, 140, + 156, 141, 172, 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, + 175, 191, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, 253, + 238, 223, 254, 239, 255, 0, 16, 1, 32, 17, 2, 48, 33, // 16x32 + 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, + 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, + 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, + 55, 128, 144, 129, 160, 145, 130, 176, 161, 146, 131, 177, 162, + 147, 178, 163, 179, 68, 84, 69, 100, 85, 70, 116, 101, 86, + 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, 40, 25, 10, + 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, 192, 208, 193, + 224, 209, 194, 240, 225, 210, 195, 241, 226, 211, 242, 227, 243, + 132, 148, 133, 164, 149, 134, 180, 165, 150, 135, 181, 166, 151, + 182, 167, 183, 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, + 121, 106, 91, 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, + 45, 30, 15, 61, 46, 31, 62, 47, 63, 256, 272, 257, 288, + 273, 258, 304, 289, 274, 259, 305, 290, 275, 306, 291, 307, 196, + 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, 246, + 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, 185, + 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, 109, + 94, 79, 125, 110, 95, 126, 111, 127, 320, 336, 321, 352, 337, + 322, 368, 353, 338, 
323, 369, 354, 339, 370, 355, 371, 260, 276, + 261, 292, 277, 262, 308, 293, 278, 263, 309, 294, 279, 310, 295, + 311, 200, 216, 201, 232, 217, 202, 248, 233, 218, 203, 249, 234, + 219, 250, 235, 251, 140, 156, 141, 172, 157, 142, 188, 173, 158, + 143, 189, 174, 159, 190, 175, 191, 384, 400, 385, 416, 401, 386, + 432, 417, 402, 387, 433, 418, 403, 434, 419, 435, 324, 340, 325, + 356, 341, 326, 372, 357, 342, 327, 373, 358, 343, 374, 359, 375, + 264, 280, 265, 296, 281, 266, 312, 297, 282, 267, 313, 298, 283, + 314, 299, 315, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, + 253, 238, 223, 254, 239, 255, 448, 464, 449, 480, 465, 450, 496, + 481, 466, 451, 497, 482, 467, 498, 483, 499, 388, 404, 389, 420, + 405, 390, 436, 421, 406, 391, 437, 422, 407, 438, 423, 439, 328, + 344, 329, 360, 345, 330, 376, 361, 346, 331, 377, 362, 347, 378, + 363, 379, 268, 284, 269, 300, 285, 270, 316, 301, 286, 271, 317, + 302, 287, 318, 303, 319, 452, 468, 453, 484, 469, 454, 500, 485, + 470, 455, 501, 486, 471, 502, 487, 503, 392, 408, 393, 424, 409, + 394, 440, 425, 410, 395, 441, 426, 411, 442, 427, 443, 332, 348, + 333, 364, 349, 334, 380, 365, 350, 335, 381, 366, 351, 382, 367, + 383, 456, 472, 457, 488, 473, 458, 504, 489, 474, 459, 505, 490, + 475, 506, 491, 507, 396, 412, 397, 428, 413, 398, 444, 429, 414, + 399, 445, 430, 415, 446, 431, 447, 460, 476, 461, 492, 477, 462, + 508, 493, 478, 463, 509, 494, 479, 510, 495, 511, 0, 16, 1, // 16x64 + 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, + 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, + 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, + 53, 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, + 161, 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, + 85, 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, + 24, 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, + 43, 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, + 226, 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, + 150, 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, + 74, 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, + 13, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, + 63, 256, 272, 257, 288, 273, 258, 304, 289, 274, 259, 305, 290, + 275, 306, 291, 307, 196, 212, 197, 228, 213, 198, 244, 229, 214, + 199, 245, 230, 215, 246, 231, 247, 136, 152, 137, 168, 153, 138, + 184, 169, 154, 139, 185, 170, 155, 186, 171, 187, 76, 92, 77, + 108, 93, 78, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 320, 336, 321, 352, 337, 322, 368, 353, 338, 323, 369, 354, 339, + 370, 355, 371, 260, 276, 261, 292, 277, 262, 308, 293, 278, 263, + 309, 294, 279, 310, 295, 311, 200, 216, 201, 232, 217, 202, 248, + 233, 218, 203, 249, 234, 219, 250, 235, 251, 140, 156, 141, 172, + 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, 175, 191, 384, + 400, 385, 416, 401, 386, 432, 417, 402, 387, 433, 418, 403, 434, + 419, 435, 324, 340, 325, 356, 341, 326, 372, 357, 342, 327, 373, + 358, 343, 374, 359, 375, 264, 280, 265, 296, 281, 266, 312, 297, + 282, 267, 313, 298, 283, 314, 299, 315, 204, 220, 205, 236, 221, + 206, 252, 237, 222, 207, 253, 238, 223, 254, 239, 255, 448, 464, + 449, 480, 465, 450, 496, 481, 466, 451, 497, 482, 467, 498, 483, + 499, 388, 404, 389, 420, 405, 390, 436, 421, 406, 391, 437, 422, + 407, 438, 423, 439, 328, 344, 329, 360, 345, 330, 376, 361, 346, + 331, 377, 362, 347, 378, 363, 379, 268, 284, 269, 300, 285, 270, + 316, 301, 286, 271, 317, 302, 287, 318, 303, 319, 452, 468, 453, + 484, 469, 454, 500, 485, 470, 455, 501, 486, 471, 502, 
487, 503, + 392, 408, 393, 424, 409, 394, 440, 425, 410, 395, 441, 426, 411, + 442, 427, 443, 332, 348, 333, 364, 349, 334, 380, 365, 350, 335, + 381, 366, 351, 382, 367, 383, 456, 472, 457, 488, 473, 458, 504, + 489, 474, 459, 505, 490, 475, 506, 491, 507, 396, 412, 397, 428, + 413, 398, 444, 429, 414, 399, 445, 430, 415, 446, 431, 447, 460, + 476, 461, 492, 477, 462, 508, 493, 478, 463, 509, 494, 479, 510, + 495, 511, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, 
// 32xN + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, // 32x2 + 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, + 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, + 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 0, // 32x4 + 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, + 67, 99, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, + 70, 39, 102, 71, 103, 8, 40, 9, 72, 41, 10, 104, 73, + 42, 11, 105, 74, 43, 106, 75, 107, 12, 44, 13, 76, 45, + 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 28, 60, 29, 92, 61, 30, + 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 0, 32, 1, // 32x8 + 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, + 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, + 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, + 101, 70, 39, 102, 71, 103, 132, 164, 133, 196, 165, 134, 228, + 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 136, + 168, 137, 200, 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, + 203, 235, 12, 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, + 78, 47, 110, 79, 111, 140, 172, 141, 204, 173, 142, 236, 205, + 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, + 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, 115, 144, 176, + 145, 208, 177, 146, 240, 209, 178, 147, 241, 210, 179, 242, 211, + 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 148, 180, 149, 212, 181, 150, 244, 213, 182, + 151, 245, 214, 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, + 120, 89, 58, 27, 121, 90, 59, 122, 91, 123, 152, 184, 153, + 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, + 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, + 126, 95, 127, 156, 188, 157, 220, 189, 158, 252, 221, 190, 159, + 253, 222, 191, 254, 223, 255, 0, 32, 1, 64, 33, 2, 96, // 32x16 + 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, + 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, + 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, + 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, 259, 353, + 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, 228, 197, + 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, 41, + 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 384, 416, + 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, 482, 451, + 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, 357, 326, + 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, 201, 170, + 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, 45, 14, + 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 388, 420, 389, + 452, 421, 390, 484, 453, 422, 391, 485, 454, 423, 486, 455, 487, + 264, 296, 265, 328, 297, 266, 360, 329, 298, 267, 361, 330, 299, + 362, 331, 363, 140, 172, 141, 204, 173, 142, 236, 205, 174, 143, + 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, 18, 112, + 81, 50, 19, 113, 82, 51, 114, 83, 115, 392, 424, 393, 456, + 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, + 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, + 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, + 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 118, 87, 
119, 396, 428, 397, 460, 429, + 398, 492, 461, 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, + 273, 336, 305, 274, 368, 337, 306, 275, 369, 338, 307, 370, 339, + 371, 148, 180, 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, + 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 400, 432, 401, 464, 433, 402, + 496, 465, 434, 403, 497, 466, 435, 498, 467, 499, 276, 308, 277, + 340, 309, 278, 372, 341, 310, 279, 373, 342, 311, 374, 343, 375, + 152, 184, 153, 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, + 250, 219, 251, 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, + 125, 94, 63, 126, 95, 127, 404, 436, 405, 468, 437, 406, 500, + 469, 438, 407, 501, 470, 439, 502, 471, 503, 280, 312, 281, 344, + 313, 282, 376, 345, 314, 283, 377, 346, 315, 378, 347, 379, 156, + 188, 157, 220, 189, 158, 252, 221, 190, 159, 253, 222, 191, 254, + 223, 255, 408, 440, 409, 472, 441, 410, 504, 473, 442, 411, 505, + 474, 443, 506, 475, 507, 284, 316, 285, 348, 317, 286, 380, 349, + 318, 287, 381, 350, 319, 382, 351, 383, 412, 444, 413, 476, 445, + 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, 0, 32, // 32x32 + 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, + 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, + 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, + 7, 101, 70, 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, + 352, 321, 290, 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, + 196, 165, 134, 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, + 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, + 106, 75, 107, 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, + 481, 450, 419, 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, + 325, 294, 263, 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, + 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, + 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, + 79, 111, 512, 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, + 578, 547, 610, 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, + 422, 391, 485, 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, + 266, 360, 329, 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, + 141, 204, 173, 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, + 239, 16, 48, 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, + 51, 114, 83, 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, + 643, 737, 706, 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, + 612, 581, 550, 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, + 456, 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, + 268, 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, + 366, 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, + 241, 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, + 85, 54, 23, 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, + 801, 770, 864, 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, + 676, 645, 708, 677, 646, 740, 709, 678, 647, 741, 710, 679, 742, + 711, 743, 520, 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, + 586, 555, 618, 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, + 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, + 274, 368, 337, 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, + 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, + 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, + 59, 122, 91, 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, + 899, 993, 962, 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, + 868, 837, 806, 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, + 712, 681, 650, 
744, 713, 682, 651, 745, 714, 683, 746, 715, 747, + 524, 556, 525, 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, + 622, 591, 623, 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, + 497, 466, 435, 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, + 341, 310, 279, 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, + 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, + 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, + 95, 127, 900, 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, + 966, 935, 998, 967, 999, 776, 808, 777, 840, 809, 778, 872, 841, + 810, 779, 873, 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, + 654, 748, 717, 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, + 529, 592, 561, 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, + 627, 404, 436, 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, + 439, 502, 471, 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, + 283, 377, 346, 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, + 968, 937, 906, 1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, + 780, 812, 781, 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, + 878, 847, 879, 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, + 753, 722, 691, 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, + 597, 566, 535, 629, 598, 567, 630, 599, 631, 408, 440, 409, 472, + 441, 410, 504, 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, + 316, 285, 348, 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, + 351, 383, 908, 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, + 974, 943, 1006, 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, + 818, 787, 881, 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, + 662, 756, 725, 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, + 537, 600, 569, 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, + 635, 412, 444, 413, 476, 445, 414, 508, 477, 446, 415, 509, 478, + 447, 510, 479, 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, + 915, 1009, 978, 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, + 884, 853, 822, 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, + 728, 697, 666, 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, + 540, 572, 541, 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, + 638, 607, 639, 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, +1013, 982, 951, 1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, + 857, 826, 795, 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, + 701, 670, 764, 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, + 952, 921, 984, 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, + 987, 1019, 796, 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, + 862, 831, 894, 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, + 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, 32, 1, 64, 33, // 32x64 + 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, + 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, + 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, + 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, + 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, + 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, + 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, + 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, + 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, + 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, + 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, + 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 
512, + 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, 578, 547, 610, + 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, 422, 391, 485, + 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, 266, 360, 329, + 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, 141, 204, 173, + 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, 643, 737, 706, + 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, 612, 581, 550, + 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, 456, 425, 394, + 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, 300, 269, + 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, 335, 367, + 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, 210, 179, + 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, + 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, 801, 770, 864, + 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, 676, 645, 708, + 677, 646, 740, 709, 678, 647, 741, 710, 679, 742, 711, 743, 520, + 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, 586, 555, 618, + 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, 430, 399, 493, + 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, 274, 368, 337, + 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, 149, 212, 181, + 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, 247, 24, 56, + 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, 59, 122, 91, + 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, 899, 993, 962, + 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, 868, 837, 806, + 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, 712, 681, 650, + 744, 713, 682, 651, 745, 714, 683, 746, 715, 747, 524, 556, 525, + 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, 622, 591, 623, + 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, 497, 466, 435, + 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, 341, 310, 279, + 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, 185, 154, 248, + 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, 60, 29, 92, + 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 900, + 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, 966, 935, 998, + 967, 999, 776, 808, 777, 840, 809, 778, 872, 841, 810, 779, 873, + 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, 654, 748, 717, + 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, 529, 592, 561, + 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, 627, 404, 436, + 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, 439, 502, 471, + 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, 283, 377, 346, + 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, 252, 221, 190, + 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, 968, 937, 906, +1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, 780, 812, 781, + 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, 878, 847, 879, + 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, 753, 722, 691, + 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, 597, 566, 535, + 629, 598, 567, 630, 599, 631, 408, 440, 409, 472, 441, 410, 504, + 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, 316, 285, 348, + 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, 351, 383, 908, + 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, 974, 943, 1006, + 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, 818, 787, 881, + 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, 662, 756, 725, + 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, 537, 600, 569, + 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, 635, 412, 444, + 413, 476, 445, 414, 508, 477, 446, 
415, 509, 478, 447, 510, 479, + 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, 915, 1009, 978, + 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, 884, 853, 822, + 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, 728, 697, 666, + 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, 540, 572, 541, + 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, 638, 607, 639, + 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, 1013, 982, 951, +1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, 857, 826, 795, + 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, 701, 670, 764, + 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, 952, 921, 984, + 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, 987, 1019, 796, + 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, 862, 831, 894, + 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, // 64xN + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, // 64x2 + 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, + 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, + 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, + 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, // 64x4 + 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, + 131, 195, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, + 134, 71, 198, 135, 199, 8, 72, 9, 136, 73, 10, 200, 137, + 74, 11, 201, 138, 75, 202, 139, 203, 12, 76, 13, 140, 77, + 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, 207, 16, 80, + 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, 210, 147, + 211, 20, 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, + 87, 214, 151, 215, 24, 88, 25, 152, 89, 26, 216, 153, 90, + 27, 217, 154, 91, 218, 155, 219, 28, 92, 29, 156, 93, 30, + 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 64, 1, 128, 65, // 64x8 + 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 256, 320, + 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, + 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, + 71, 198, 135, 199, 260, 324, 261, 388, 325, 262, 452, 389, 326, + 263, 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, + 200, 137, 74, 11, 201, 138, 75, 202, 139, 203, 264, 328, 265, + 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, + 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, + 206, 143, 207, 268, 332, 269, 396, 333, 270, 460, 397, 334, 271, + 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, 208, + 145, 82, 19, 209, 146, 83, 210, 147, 211, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 276, 340, 277, 404, 341, 278, 468, 405, 342, 279, 469, + 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, 216, 153, + 90, 27, 217, 154, 91, 218, 155, 219, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, + 351, 478, 415, 479, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 
511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, // 64x16 + 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, + 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, + 577, 514, 704, 641, 578, 515, 705, 642, 579, 706, 643, 707, 260, + 324, 261, 388, 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, + 391, 455, 8, 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, + 138, 75, 202, 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, + 834, 771, 961, 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, + 518, 708, 645, 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, + 265, 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, + 459, 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, + 79, 206, 143, 207, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 780, 844, 781, 908, 845, 782, 972, 909, 846, 783, 973, + 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, 720, 657, + 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, 404, 341, + 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, 24, 88, + 25, 152, 89, 26, 216, 153, 90, 27, 217, 154, 91, 218, 155, + 219, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, 914, + 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, 598, + 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, 282, + 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, 29, + 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 792, 856, 793, 920, + 857, 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, + 604, 541, 668, 
605, 542, 732, 669, 606, 543, 733, 670, 607, 734, + 671, 735, 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, + 926, 863, 990, 927, 991, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 64, 1, // 64x32 + 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, + 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, + 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, + 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, 577, 514, 704, + 641, 578, 515, 705, 642, 579, 
706, 643, 707, 260, 324, 261, 388, + 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, 391, 455, 8, + 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, 138, 75, 202, + 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, 834, 771, 961, + 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, 518, 708, 645, + 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, 265, 392, 329, + 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, 12, 76, + 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, + 207, 1024, 1088, 1025, 1152, 1089, 1026, 1216, 1153, 1090, 1027, 1217, 1154, +1091, 1218, 1155, 1219, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 1280, 1344, 1281, 1408, 1345, 1282, 1472, 1409, 1346, 1283, +1473, 1410, 1347, 1474, 1411, 1475, 1028, 1092, 1029, 1156, 1093, 1030, 1220, +1157, 1094, 1031, 1221, 1158, 1095, 1222, 1159, 1223, 776, 840, 777, 904, + 841, 778, 968, 905, 842, 779, 969, 906, 843, 970, 907, 971, 524, + 588, 525, 652, 589, 526, 716, 653, 590, 527, 717, 654, 591, 718, + 655, 719, 272, 336, 273, 400, 337, 274, 464, 401, 338, 275, 465, + 402, 339, 466, 403, 467, 20, 84, 21, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 214, 151, 215, 1536, 1600, 1537, 1664, 1601, +1538, 1728, 1665, 1602, 1539, 1729, 1666, 1603, 1730, 1667, 1731, 1284, 1348, +1285, 1412, 1349, 1286, 1476, 1413, 1350, 1287, 1477, 1414, 1351, 1478, 1415, +1479, 1032, 1096, 1033, 1160, 1097, 1034, 1224, 1161, 1098, 1035, 1225, 1162, +1099, 1226, 1163, 1227, 780, 844, 781, 908, 845, 782, 972, 909, 846, + 783, 973, 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, + 720, 657, 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, + 404, 341, 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, + 24, 88, 25, 152, 89, 26, 216, 153, 90, 27, 217, 154, 91, + 218, 155, 219, 1792, 1856, 1793, 1920, 1857, 1794, 1984, 1921, 1858, 1795, +1985, 1922, 1859, 1986, 1923, 1987, 1540, 1604, 1541, 1668, 1605, 1542, 1732, +1669, 1606, 1543, 1733, 1670, 1607, 1734, 1671, 1735, 1288, 1352, 1289, 1416, +1353, 1290, 1480, 1417, 1354, 1291, 1481, 1418, 1355, 1482, 1419, 1483, 1036, +1100, 1037, 1164, 1101, 1038, 1228, 1165, 1102, 1039, 1229, 1166, 1103, 1230, +1167, 1231, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, + 914, 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, + 598, 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 1796, 1860, 1797, 1924, 1861, 1798, 1988, 1925, 1862, 1799, 1989, 1926, +1863, 1990, 1927, 1991, 1544, 1608, 1545, 1672, 1609, 1546, 1736, 1673, 1610, +1547, 1737, 1674, 1611, 1738, 1675, 1739, 1292, 1356, 1293, 1420, 1357, 1294, +1484, 1421, 1358, 1295, 1485, 1422, 1359, 1486, 1423, 1487, 1040, 1104, 1041, +1168, 1105, 1042, 1232, 1169, 1106, 1043, 1233, 1170, 1107, 1234, 1171, 1235, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 1800, 1864, 1801, 1928, +1865, 1802, 1992, 1929, 1866, 1803, 1993, 1930, 1867, 1994, 1931, 1995, 1548, +1612, 1549, 1676, 1613, 1550, 1740, 1677, 1614, 1551, 1741, 
1678, 1615, 1742, +1679, 1743, 1296, 1360, 1297, 1424, 1361, 1298, 1488, 1425, 1362, 1299, 1489, +1426, 1363, 1490, 1427, 1491, 1044, 1108, 1045, 1172, 1109, 1046, 1236, 1173, +1110, 1047, 1237, 1174, 1111, 1238, 1175, 1239, 792, 856, 793, 920, 857, + 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, 604, + 541, 668, 605, 542, 732, 669, 606, 543, 733, 670, 607, 734, 671, + 735, 1804, 1868, 1805, 1932, 1869, 1806, 1996, 1933, 1870, 1807, 1997, 1934, +1871, 1998, 1935, 1999, 1552, 1616, 1553, 1680, 1617, 1554, 1744, 1681, 1618, +1555, 1745, 1682, 1619, 1746, 1683, 1747, 1300, 1364, 1301, 1428, 1365, 1302, +1492, 1429, 1366, 1303, 1493, 1430, 1367, 1494, 1431, 1495, 1048, 1112, 1049, +1176, 1113, 1050, 1240, 1177, 1114, 1051, 1241, 1178, 1115, 1242, 1179, 1243, + 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, 926, 863, + 990, 927, 991, 1808, 1872, 1809, 1936, 1873, 1810, 2000, 1937, 1874, 1811, +2001, 1938, 1875, 2002, 1939, 2003, 1556, 1620, 1557, 1684, 1621, 1558, 1748, +1685, 1622, 1559, 1749, 1686, 1623, 1750, 1687, 1751, 1304, 1368, 1305, 1432, +1369, 1306, 1496, 1433, 1370, 1307, 1497, 1434, 1371, 1498, 1435, 1499, 1052, +1116, 1053, 1180, 1117, 1054, 1244, 1181, 1118, 1055, 1245, 1182, 1119, 1246, +1183, 1247, 1812, 1876, 1813, 1940, 1877, 1814, 2004, 1941, 1878, 1815, 2005, +1942, 1879, 2006, 1943, 2007, 1560, 1624, 1561, 1688, 1625, 1562, 1752, 1689, +1626, 1563, 1753, 1690, 1627, 1754, 1691, 1755, 1308, 1372, 1309, 1436, 1373, +1310, 1500, 1437, 1374, 1311, 1501, 1438, 1375, 1502, 1439, 1503, 1816, 1880, +1817, 1944, 1881, 1818, 2008, 1945, 1882, 1819, 2009, 1946, 1883, 2010, 1947, +2011, 1564, 1628, 1565, 1692, 1629, 1566, 1756, 1693, 1630, 1567, 1757, 1694, +1631, 1758, 1695, 1759, 1820, 1884, 1821, 1948, 1885, 1822, 2012, 1949, 1886, +1823, 2013, 1950, 1887, 2014, 1951, 2015, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x64 + 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, + 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, + 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, + 512, 576, 513, 640, 577, 514, 704, 641, 578, 515, 705, 642, 579, + 706, 643, 707, 260, 324, 261, 388, 325, 262, 452, 389, 326, 263, + 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, 200, + 137, 74, 11, 201, 138, 75, 202, 139, 203, 768, 832, 769, 896, + 833, 770, 960, 897, 834, 771, 961, 898, 835, 962, 899, 963, 516, + 580, 517, 644, 581, 518, 708, 645, 582, 519, 709, 646, 583, 710, + 647, 711, 264, 328, 265, 392, 329, 266, 456, 393, 330, 267, 457, + 394, 331, 458, 395, 459, 12, 76, 13, 140, 77, 14, 204, 141, + 78, 15, 205, 142, 79, 206, 143, 207, 1024, 1088, 1025, 1152, 1089, +1026, 1216, 1153, 1090, 1027, 1217, 1154, 1091, 1218, 1155, 1219, 772, 836, + 773, 900, 837, 774, 964, 901, 838, 775, 965, 902, 839, 966, 903, + 967, 520, 584, 521, 648, 585, 522, 712, 649, 586, 523, 713, 650, + 587, 714, 651, 715, 268, 332, 269, 396, 333, 270, 460, 397, 334, + 271, 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, + 208, 145, 82, 19, 209, 146, 83, 210, 147, 211, 1280, 1344, 1281, +1408, 1345, 1282, 1472, 1409, 1346, 1283, 1473, 1410, 1347, 1474, 1411, 1475, +1028, 1092, 1029, 1156, 1093, 1030, 1220, 1157, 1094, 1031, 1221, 1158, 1095, +1222, 1159, 1223, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 1536, 1600, 1537, 1664, 1601, 1538, 1728, 1665, 1602, 1539, 1729, +1666, 1603, 1730, 1667, 1731, 1284, 1348, 1285, 1412, 1349, 1286, 1476, 1413, +1350, 1287, 1477, 1414, 1351, 1478, 1415, 1479, 1032, 1096, 1033, 1160, 1097, +1034, 1224, 1161, 1098, 1035, 1225, 1162, 1099, 1226, 1163, 1227, 780, 844, + 781, 908, 845, 782, 972, 909, 846, 783, 973, 910, 847, 974, 911, + 975, 528, 592, 529, 656, 593, 530, 720, 657, 594, 531, 721, 658, + 595, 722, 659, 723, 276, 340, 277, 404, 341, 278, 468, 405, 342, + 279, 469, 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, + 216, 153, 90, 27, 217, 154, 91, 218, 155, 219, 1792, 1856, 1793, +1920, 1857, 1794, 1984, 1921, 1858, 1795, 1985, 1922, 1859, 1986, 1923, 1987, +1540, 1604, 1541, 1668, 1605, 1542, 1732, 1669, 1606, 1543, 1733, 1670, 1607, +1734, 1671, 1735, 1288, 1352, 1289, 1416, 1353, 1290, 1480, 1417, 1354, 1291, +1481, 
1418, 1355, 1482, 1419, 1483, 1036, 1100, 1037, 1164, 1101, 1038, 1228, +1165, 1102, 1039, 1229, 1166, 1103, 1230, 1167, 1231, 784, 848, 785, 912, + 849, 786, 976, 913, 850, 787, 977, 914, 851, 978, 915, 979, 532, + 596, 533, 660, 597, 534, 724, 661, 598, 535, 725, 662, 599, 726, + 663, 727, 280, 344, 281, 408, 345, 282, 472, 409, 346, 283, 473, + 410, 347, 474, 411, 475, 28, 92, 29, 156, 93, 30, 220, 157, + 94, 31, 221, 158, 95, 222, 159, 223, 1796, 1860, 1797, 1924, 1861, +1798, 1988, 1925, 1862, 1799, 1989, 1926, 1863, 1990, 1927, 1991, 1544, 1608, +1545, 1672, 1609, 1546, 1736, 1673, 1610, 1547, 1737, 1674, 1611, 1738, 1675, +1739, 1292, 1356, 1293, 1420, 1357, 1294, 1484, 1421, 1358, 1295, 1485, 1422, +1359, 1486, 1423, 1487, 1040, 1104, 1041, 1168, 1105, 1042, 1232, 1169, 1106, +1043, 1233, 1170, 1107, 1234, 1171, 1235, 788, 852, 789, 916, 853, 790, + 980, 917, 854, 791, 981, 918, 855, 982, 919, 983, 536, 600, 537, + 664, 601, 538, 728, 665, 602, 539, 729, 666, 603, 730, 667, 731, + 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, 351, + 478, 415, 479, 1800, 1864, 1801, 1928, 1865, 1802, 1992, 1929, 1866, 1803, +1993, 1930, 1867, 1994, 1931, 1995, 1548, 1612, 1549, 1676, 1613, 1550, 1740, +1677, 1614, 1551, 1741, 1678, 1615, 1742, 1679, 1743, 1296, 1360, 1297, 1424, +1361, 1298, 1488, 1425, 1362, 1299, 1489, 1426, 1363, 1490, 1427, 1491, 1044, +1108, 1045, 1172, 1109, 1046, 1236, 1173, 1110, 1047, 1237, 1174, 1111, 1238, +1175, 1239, 792, 856, 793, 920, 857, 794, 984, 921, 858, 795, 985, + 922, 859, 986, 923, 987, 540, 604, 541, 668, 605, 542, 732, 669, + 606, 543, 733, 670, 607, 734, 671, 735, 1804, 1868, 1805, 1932, 1869, +1806, 1996, 1933, 1870, 1807, 1997, 1934, 1871, 1998, 1935, 1999, 1552, 1616, +1553, 1680, 1617, 1554, 1744, 1681, 1618, 1555, 1745, 1682, 1619, 1746, 1683, +1747, 1300, 1364, 1301, 1428, 1365, 1302, 1492, 1429, 1366, 1303, 1493, 1430, +1367, 1494, 1431, 1495, 1048, 1112, 1049, 1176, 1113, 1050, 1240, 1177, 1114, +1051, 1241, 1178, 1115, 1242, 1179, 1243, 796, 860, 797, 924, 861, 798, + 988, 925, 862, 799, 989, 926, 863, 990, 927, 991, 1808, 1872, 1809, +1936, 1873, 1810, 2000, 1937, 1874, 1811, 2001, 1938, 1875, 2002, 1939, 2003, +1556, 1620, 1557, 1684, 1621, 1558, 1748, 1685, 1622, 1559, 1749, 1686, 1623, +1750, 1687, 1751, 1304, 1368, 1305, 1432, 1369, 1306, 1496, 1433, 1370, 1307, +1497, 1434, 1371, 1498, 1435, 1499, 1052, 1116, 1053, 1180, 1117, 1054, 1244, +1181, 1118, 1055, 1245, 1182, 1119, 1246, 1183, 1247, 1812, 1876, 1813, 1940, +1877, 1814, 2004, 1941, 1878, 1815, 2005, 1942, 1879, 2006, 1943, 2007, 1560, +1624, 1561, 1688, 1625, 1562, 1752, 1689, 1626, 1563, 1753, 1690, 1627, 1754, +1691, 1755, 1308, 1372, 1309, 1436, 1373, 1310, 1500, 1437, 1374, 1311, 1501, +1438, 1375, 1502, 1439, 1503, 1816, 1880, 1817, 1944, 1881, 1818, 2008, 1945, +1882, 1819, 2009, 1946, 1883, 2010, 1947, 2011, 1564, 1628, 1565, 1692, 1629, +1566, 1756, 1693, 1630, 1567, 1757, 1694, 1631, 1758, 1695, 1759, 1820, 1884, +1821, 1948, 1885, 1822, 2012, 1949, 1886, 1823, 2013, 1950, 1887, 2014, 1951, +2015, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, +}; + +// Get scan order table based on scan group type (grouped or ungrouped) +// and log2 block width and height index +static const uint32_t* const g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = +{ + { + { g_scan_order_buffer + 0, g_scan_order_buffer + 1, g_scan_order_buffer + 3, g_scan_order_buffer + 7, g_scan_order_buffer + 15, g_scan_order_buffer + 31, g_scan_order_buffer + 63, }, + { g_scan_order_buffer + 127, g_scan_order_buffer + 129, g_scan_order_buffer + 133, g_scan_order_buffer + 141, g_scan_order_buffer + 157, g_scan_order_buffer + 189, g_scan_order_buffer + 253, }, + { g_scan_order_buffer + 381, g_scan_order_buffer + 385, g_scan_order_buffer + 393, g_scan_order_buffer + 409, g_scan_order_buffer + 441, g_scan_order_buffer + 505, g_scan_order_buffer + 633, }, + { g_scan_order_buffer + 889, g_scan_order_buffer + 897, g_scan_order_buffer + 913, g_scan_order_buffer + 945, g_scan_order_buffer + 1009, g_scan_order_buffer + 1137, g_scan_order_buffer + 1393, }, + { g_scan_order_buffer + 1905, g_scan_order_buffer + 1921, g_scan_order_buffer + 1953, g_scan_order_buffer + 2017, g_scan_order_buffer + 2145, g_scan_order_buffer + 2401, g_scan_order_buffer + 2913, }, + { g_scan_order_buffer + 3937, g_scan_order_buffer + 3969, g_scan_order_buffer + 4033, g_scan_order_buffer + 4161, g_scan_order_buffer + 4417, g_scan_order_buffer + 4929, g_scan_order_buffer + 5953, }, + { g_scan_order_buffer + 8001, g_scan_order_buffer + 8065, g_scan_order_buffer + 8193, g_scan_order_buffer + 8449, g_scan_order_buffer + 8961, g_scan_order_buffer + 9985, g_scan_order_buffer + 12033, }, + }, + { + { g_scan_order_buffer + 16129, g_scan_order_buffer + 16130, g_scan_order_buffer + 16132, g_scan_order_buffer + 16136, g_scan_order_buffer + 16144, g_scan_order_buffer + 16160, g_scan_order_buffer + 16192, }, + { g_scan_order_buffer + 16256, g_scan_order_buffer + 16258, g_scan_order_buffer + 16262, g_scan_order_buffer + 16270, g_scan_order_buffer + 16286, g_scan_order_buffer + 16318, g_scan_order_buffer + 16382, }, + { g_scan_order_buffer + 16510, g_scan_order_buffer + 16514, g_scan_order_buffer + 16522, g_scan_order_buffer + 16538, g_scan_order_buffer + 16570, g_scan_order_buffer + 16634, g_scan_order_buffer + 16762, }, + { g_scan_order_buffer + 17018, g_scan_order_buffer + 17026, g_scan_order_buffer + 17042, g_scan_order_buffer + 17074, g_scan_order_buffer + 17138, g_scan_order_buffer + 17266, g_scan_order_buffer + 17522, }, + { g_scan_order_buffer + 18034, g_scan_order_buffer + 18050, g_scan_order_buffer + 18082, g_scan_order_buffer + 18146, g_scan_order_buffer + 18274, g_scan_order_buffer + 18530, g_scan_order_buffer + 19042, }, + { g_scan_order_buffer + 20066, g_scan_order_buffer + 20098, g_scan_order_buffer + 20162, g_scan_order_buffer + 20290, g_scan_order_buffer + 20546, g_scan_order_buffer + 21058, g_scan_order_buffer + 22082, }, + { g_scan_order_buffer + 24130, g_scan_order_buffer + 24194, g_scan_order_buffer + 24322, g_scan_order_buffer + 24578, g_scan_order_buffer + 25090, g_scan_order_buffer + 26114, g_scan_order_buffer + 28162, }, + } +}; + + +/** + * \brief Return array 
of scan order indices. + * + * \param scan_group Scan group type, SCAN_GROUP_UNGROUPED or SCAN_GROUP_4X4. + * \param scan_type Scan type, diagonal, horizontal or vertical. + * \param log2_w Log2 of block width. + * \param log2_h Log2 of block height. + * + * \return Pointer to the scan order table for the given dimensions. + */ +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) +{ + // TODO: horizontal and vertical scan types + assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); + + if (scan_group == SCAN_GROUP_4X4) { + return g_scan_order[scan_group][log2_w][log2_h]; + } + else { + if (log2_w <= 1 || log2_h <= 1) { + // Just return the 1x16 table, which contains indices 0 to 15 in order + return g_scan_order[scan_group][0][4]; + } + else { + return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; + } + } +} diff --git a/src/tables.h b/src/tables.h index 1ab81cfb..44621251 100644 --- a/src/tables.h +++ b/src/tables.h @@ -134,6 +134,15 @@ typedef enum */ extern const uint32_t* const uvg_g_sig_last_scan[3][5]; extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1]; +extern const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1]; extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; +#define SCAN_GROUP_TYPES 2 +#define MAX_LOG2_INDEX 7 + +#define SCAN_GROUP_UNGROUPED 0 +#define SCAN_GROUP_4X4 1 + +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); + #endif //TABLES_H_ diff --git a/src/transform.c b/src/transform.c index c0adc121..98728da0 100644 --- a/src/transform.c +++ b/src/transform.c @@ -34,8 +34,10 @@ #include "encode_coding_tree.h" #include "image.h" +#include "intra.h" #include "uvg266.h" #include "lfnst_tables.h" +#include "rate_control.h" #include "rdo.h" #include "strategies/strategies-dct.h" #include "strategies/strategies-quant.h" @@ -77,6 +79,7 @@ const uint8_t uvg_g_chroma_scale[58]= * Parameters pred_in and rec_out may be aliased. * * \param width Transform width. + * \param height Transform height. * \param in_stride Stride for ref_in and pred_in * \param out_stride Stride for rec_out. * \param ref_in Reference pixels. @@ -87,6 +90,7 @@ const uint8_t uvg_g_chroma_scale[58]= * \returns Whether coeff_out contains any non-zero coefficients. */ static bool bypass_transquant(const int width, + const int height, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, @@ -96,7 +100,7 @@ static bool bypass_transquant(const int width, { bool nonzero_coeffs = false; - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int32_t in_idx = x + y * in_stride; int32_t out_idx = x + y * out_stride; @@ -123,6 +127,7 @@ static bool bypass_transquant(const int width, * \param coeff coefficients (residual) to filter */ static void rdpcm(const int width, + const int height, const rdpcm_dir dir, coeff_t *coeff) { @@ -130,7 +135,7 @@ static void rdpcm(const int width, const int min_x = (dir == RDPCM_HOR) ? 1 : 0; const int min_y = (dir == RDPCM_HOR) ?
0 : 1; - for (int y = width - 1; y >= min_y; y--) { + for (int y = height - 1; y >= min_y; y--) { for (int x = width - 1; x >= min_x; x--) { const int index = x + y * width; coeff[index] -= coeff[index - offset]; @@ -171,19 +176,26 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con */ void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int width, - const int height) + const int height, + const vector2d_t * const lcu_px, + color_t color) { - coeff_scan_order_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + coeff_scan_order_t scan_idx = SCAN_DIAG; // ToDo: large block support in VVC? - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t* scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); signed scan_pos_last = -1; + coeff_t temp[TR_MAX_WIDTH * TR_MAX_WIDTH]; + if(lcu_px != NULL) { + uvg_get_sub_coeff(temp, coeff, lcu_px->x, lcu_px->y, width, height, color == COLOR_Y? LCU_WIDTH : LCU_WIDTH_C); + coeff = temp; + } for (int i = 0; i < width * height; i++) { if (coeff[scan[i]]) { @@ -203,17 +215,18 @@ void uvg_derive_lfnst_constraints( /** * \brief NxN inverse transform (2D) - * \param coeff input data (transform coefficients) - * \param block output data (residual) - * \param block_size input data (width of transform) + * \param coeff input data (transform coefficients) + * \param block output data (residual) + * \param width transform width + * \param height transform height */ -void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height) { - int32_t j,k; - for (j = 0; j < block_size; j++) { - for(k = 0; k < block_size; k ++) { + int32_t j, k; + for (j = 0; j < height; j++) { + for(k = 0; k < width; k ++) { // Casting back and forth to make UBSan not trigger due to left-shifting negatives - coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k])); + coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k])); } } } @@ -224,12 +237,12 @@ void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,i * \param block output data (residual) * \param block_size width of transform */ -void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_width, int8_t block_height) { int32_t j,k; - for ( j = 0; j < block_size; j++ ) { - for(k = 0; k < block_size; k ++) { - block[j * block_size + k] = coeff[j * block_size + k]; + for ( j = 0; j < block_height; j++ ) { + for(k = 0; k < block_width; k ++) { + block[j * block_width + k] = coeff[j * block_width + k]; } } } @@ -243,17 +256,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block, void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx) + 
if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height) { - uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts); + uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts); } else { - dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type); dct_func(encoder->bitdepth, block, coeff); } } @@ -261,17 +275,18 @@ void uvg_transform2d(const encoder_control_t * const encoder, void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts) + if (encoder->cfg.mts || block_width != block_height) { - uvg_mts_idct(encoder->bitdepth, color, tu, block_size, coeff, block, encoder->cfg.mts); + uvg_mts_idct(encoder->bitdepth, color, tu, block_width, block_height, coeff, block, encoder->cfg.mts); } else { - dct_func *idct_func = uvg_get_idct_func(block_size, color, tu->type); + dct_func *idct_func = uvg_get_idct_func(block_width, block_height, color, tu->type); idct_func(encoder->bitdepth, coeff, block); } } @@ -348,7 +363,7 @@ static void generate_jccr_transforms( } } } - costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1; + costs[jccr] = jccr == 0 ? MIN(d1, d2) : d1; } int64_t min_dist1 = costs[0]; int64_t min_dist2 = INT64_MAX; @@ -373,6 +388,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask1 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -386,6 +402,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask2 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -401,27 +418,96 @@ static void generate_jccr_transforms( static void quantize_chroma( encoder_state_t* const state, - int depth, - int8_t width, - int8_t height, + cu_info_t * const cur_tu, + const cu_loc_t* const cu_loc, coeff_t u_coeff[5120], coeff_t v_coeff[2048], - enum uvg_chroma_transforms transforms[5], - const int trans_offset, - int i, + enum uvg_chroma_transforms transform, coeff_t u_quant_coeff[1024], coeff_t v_quant_coeff[1024], const coeff_scan_order_t scan_order, bool* u_has_coeffs, bool* v_has_coeffs, - uint8_t lfnst_idx) + uint8_t lfnst_idx, + enum uvg_tree_type tree_type, + double* u_coeff_cost, + double* v_coeff_cost) { + int8_t width = cu_loc->chroma_width; + int8_t height = cu_loc->chroma_height; + if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { + int abs_sum = 0; + state->quant_blocks[2].needs_init |= state->encoder_control->cfg.jccr; + uvg_dep_quant( + state, + cur_tu, + width, + height, + u_coeff, + u_quant_coeff, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + + cbf_clear(&cur_tu->cbf, COLOR_U); + if (abs_sum > 0) { + *u_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_U); + } + + *u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + cur_tu, + cu_loc, + COLOR_U, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); + + if (transform == DCT7_CHROMA) { + abs_sum = 0; + state->rate_estimator[2].needs_init = true; + uvg_dep_quant( + state, + cur_tu, + width, + height, + v_coeff, + v_quant_coeff, + COLOR_V, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + + cbf_clear(&cur_tu->cbf, COLOR_V); + if (abs_sum > 0) { + *v_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_V); + } + 
+ *v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + cur_tu, + cu_loc, + COLOR_V, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); + cbf_clear(&cur_tu->cbf, COLOR_U); + cbf_clear(&cur_tu->cbf, COLOR_V); + } + return; + } if (state->encoder_control->cfg.rdoq_enable && - (transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) + (transform != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { - uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, depth, 0, - lfnst_idx); + uvg_rdoq(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? COLOR_U : COLOR_V, + scan_order, CU_INTRA, 0, lfnst_idx, 0); int j; for (j = 0; j < width * height; ++j) { @@ -431,26 +517,25 @@ static void quantize_chroma( } } - if (transforms[i] == DCT7_CHROMA) { + if (transform == DCT7_CHROMA) { uint16_t temp_cbf = 0; - if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U); - uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, depth, temp_cbf, - lfnst_idx); + if (*u_has_coeffs)cbf_set(&temp_cbf, COLOR_U); + uvg_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, + scan_order, CU_INTRA, temp_cbf, lfnst_idx, 0); } } - else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) { - uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U, scan_order); - uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, scan_order); + else if (state->encoder_control->cfg.rdoq_enable && transform == CHROMA_TS) { + uvg_ts_rdoq(state, u_coeff, u_quant_coeff, width, height, COLOR_U, scan_order); + uvg_ts_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, scan_order); } else { - uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + uvg_quant(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? 
COLOR_U : COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); - if (!IS_JCCR_MODE(transforms[i])) { - uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + if (!IS_JCCR_MODE(transform)) { + uvg_quant(state, v_coeff, v_quant_coeff, width, height, COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); } } @@ -460,7 +545,7 @@ static void quantize_chroma( break; } } - if (!IS_JCCR_MODE(transforms[i])) { + if (!IS_JCCR_MODE(transform)) { for (int j = 0; j < width * height; ++j) { if (v_quant_coeff[j]) { *v_has_coeffs = 1; @@ -472,13 +557,10 @@ static void quantize_chroma( void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], @@ -489,13 +571,18 @@ void uvg_chroma_transform_search( { ALIGNED(64) coeff_t u_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 5]; ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; - ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; + ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; // In case of JCCR the v channel does not have coefficients ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; + const int width = cu_loc->chroma_width; + const int height = cu_loc->chroma_height; + + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + uvg_transform2d( - state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu + state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); uvg_transform2d( - state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu + state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu ); enum uvg_chroma_transforms transforms[5]; transforms[0] = DCT7_CHROMA; @@ -508,8 +595,8 @@ void uvg_chroma_transform_search( pred_cu->cr_lfnst_idx == 0 ; if (can_use_tr_skip) { - uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width); - uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width); + uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height); + uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height); transforms[num_transforms] = CHROMA_TS; num_transforms++; } @@ -526,6 +613,9 @@ void uvg_chroma_transform_search( trans_offset, &num_transforms); } + + double lambda = state->c_lambda; + chorma_ts_out->best_u_cost = MAX_DOUBLE; chorma_ts_out->best_v_cost = MAX_DOUBLE; chorma_ts_out->best_combined_cost = MAX_DOUBLE; @@ -537,58 +627,76 @@ void uvg_chroma_transform_search( coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; - const coeff_scan_order_t scan_order = - uvg_get_scan_order(pred_cu->type, mode, depth); bool u_has_coeffs = false; bool v_has_coeffs = false; + bool is_jccr = IS_JCCR_MODE(transforms[i]); if(pred_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); - if (!IS_JCCR_MODE(transforms[i])) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, 
pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); + if (!is_jccr) { + uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } } + uint8_t old_jccr = pred_cu->joint_cb_cr; + pred_cu->joint_cb_cr = 0; + if(is_jccr) { + state->c_lambda = lambda * (transforms[i] == JCCR_3 ? 0.5 : 0.8); + pred_cu->joint_cb_cr = transforms[i]; + } + else if(state->encoder_control->cfg.dep_quant) { + state->search_cabac.update = 1; + } + + double u_coeff_cost = 0; + double v_coeff_cost = 0; + unsigned ssd_u = 0; + unsigned ssd_v = 0; + double u_bits = 0; + double v_bits = 0; + quantize_chroma( state, - depth, - width, - height, - u_coeff, - v_coeff, - transforms, - trans_offset, - i, + pred_cu, + cu_loc, + &u_coeff[i * trans_offset], + &v_coeff[i * trans_offset], + transforms[i], u_quant_coeff, v_quant_coeff, - scan_order, + SCAN_DIAG, &u_has_coeffs, - &v_has_coeffs, - pred_cu->cr_lfnst_idx); + &v_has_coeffs, tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx, + tree_type, + &u_coeff_cost, + &v_coeff_cost); + pred_cu->joint_cb_cr = old_jccr; + if (pred_cu->cr_lfnst_idx != 0 && !u_has_coeffs && !v_has_coeffs) goto reset_cabac; - if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { + if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { bool constraints[2] = { false, false }; - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, u_quant_coeff, width, height); - if(!IS_JCCR_MODE(transforms[i])) { - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, v_quant_coeff, width, height); + uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U); + if(!is_jccr) { + uvg_derive_lfnst_constraints(pred_cu, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } - if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; + if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) goto reset_cabac; } - if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue; + if (is_jccr && !u_has_coeffs) goto reset_cabac; if (u_has_coeffs) { - - uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, + uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height); } + if (transforms[i] != JCCR_1) { for (int j = 0; j < width * height; j++) { u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((uvg_pixel)(u_pred[j] + u_recon_resi[j])); @@ -603,24 +711,28 @@ void uvg_chroma_transform_search( else { uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width); } - if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) { - uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, + + + if (v_has_coeffs && !is_jccr) { + uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height); } + for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]); } } - else if (u_has_coeffs && IS_JCCR_MODE(transforms[i])) { + else if (u_has_coeffs && is_jccr) { if (transforms[i] == JCCR_1) { for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + u_recon_resi[j]); @@ -641,19 +753,17 @@ void uvg_chroma_transform_search( uvg_pixels_blit(v_pred, &v_recon[trans_offset * i], width, height, width, width); } - unsigned ssd_u = 0; - unsigned ssd_v = 0; if (!state->encoder_control->cfg.lossless) { ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); + ssd_u = (double)ssd_u * state->chroma_weights[1]; + ssd_v = (double)ssd_v * state->chroma_weights[2]; } - double u_bits = 0; - double v_bits = 0; state->search_cabac.update = 1; int cbf_u = transforms[i] & 2 || (u_has_coeffs && !(transforms[i] & 1)); @@ -677,33 +787,40 @@ void uvg_chroma_transform_search( transforms[i] == CHROMA_TS, u_bits, "tr_skip_u" ); } - double coeff_cost = uvg_get_coeff_cost( - state, - u_quant_coeff, - pred_cu, - width, - COLOR_U, - scan_order, - transforms[i] == CHROMA_TS); - u_bits += coeff_cost; + if(u_coeff_cost == 0) { + u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + pred_cu, + cu_loc, + COLOR_U, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } - if (cbf_v && !IS_JCCR_MODE(transforms[i])) { + if (cbf_v && !is_jccr) { if (can_use_tr_skip) { CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma, transforms[i] == CHROMA_TS, v_bits, "tr_skip_v" ); } - v_bits += uvg_get_coeff_cost( - state, - v_quant_coeff, - pred_cu, - width, - COLOR_V, - 
scan_order, - transforms[i] == CHROMA_TS); + if (v_coeff_cost == 0) { + v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + pred_cu, + cu_loc, + COLOR_V, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } + u_bits += u_coeff_cost; + v_bits += v_coeff_cost; if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { - if(uvg_is_lfnst_allowed(state, pred_cu, width, height, 0, 0 , UVG_CHROMA_T, COLOR_UV, lcu)) { + if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc, lcu)) { const int lfnst_idx = pred_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( &state->search_cabac, @@ -723,25 +840,35 @@ void uvg_chroma_transform_search( pred_cu->lfnst_last_scan_pos = false; pred_cu->violates_lfnst_constrained_chroma = false; } - if (!IS_JCCR_MODE(transforms[i])) { - double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda; - double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->frame->lambda; + + if (!is_jccr) { + double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->lambda; + double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->lambda; if (u_cost < chorma_ts_out->best_u_cost) { chorma_ts_out->best_u_cost = u_cost; chorma_ts_out->best_u_index = u_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; } if (v_cost < chorma_ts_out->best_v_cost) { chorma_ts_out->best_v_cost = v_cost; chorma_ts_out->best_v_index = v_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } else { - double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->frame->lambda; - if (cost < chorma_ts_out->best_combined_cost) { + double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->lambda; + if (cost < chorma_ts_out->best_combined_cost && cost < chorma_ts_out->best_u_cost + chorma_ts_out->best_v_cost) { chorma_ts_out->best_combined_cost = cost; chorma_ts_out->best_combined_index = transforms[i]; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } +reset_cabac: memcpy(&state->search_cabac, temp_cabac, sizeof(cabac_data_t)); } } @@ -786,12 +913,52 @@ void uvg_fwd_lfnst_NxN(coeff_t *src, coeff_t *dst, const int8_t mode, const int8 } } -static inline bool get_transpose_flag(const int8_t intra_mode) +static uint32_t get_lfnst_intra_mode(int mode) +{ + uint32_t intraMode; + + if (mode < 0) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1) + NUM_LUMA_MODE); + } + else if (mode >= NUM_LUMA_MODE) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1)); + } + else + { + intraMode = (uint32_t)mode; + } + + return intraMode; +} + +static bool get_transpose_flag(const int8_t intra_mode) { return ((intra_mode >= NUM_LUMA_MODE) && (intra_mode >= (NUM_LUMA_MODE + (NUM_EXT_LUMA_MODE >> 1)))) || ((intra_mode < NUM_LUMA_MODE) && (intra_mode > DIA_IDX)); } + +static inline bool block_is_mip(const cu_info_t * const cur_cu, const color_t color, const bool is_sep_tree) +{ + if (cur_cu->type == CU_INTRA) { + if (color == COLOR_Y) { + return cur_cu->intra.mip_flag; + } + else { + // MIP_TODO: currently, only chroma 420 is supported. Therefore this will always return false + + //bool derived_mode = cur_cu->intra.mode_chroma == (!cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0); + //bool is_chroma_mip = !is_sep_tree /*&& chroma_format == CHROMA_444*/ && cur_cu->intra.mip_flag; + //return is_chroma_mip && derived_mode; + + return false; + } + } + return false; +} + void uvg_fwd_lfnst( const cu_info_t* const cur_cu, const int width, @@ -799,42 +966,47 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { const uint16_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; + // This check is safe for 8x16 cus split with TT, since it is checking the dimensions of the + // last luma CU which will be 8x4, i.e., 3 + 2 < 6 + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA ? cur_cu->intra.mip_flag : false; - bool is_wide_angle = false; // TODO: get wide angle mode when implemented + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); + + const int scan_order = SCAN_DIAG; - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); - - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - assert(log2_block_size != -1 && "LFNST: invalid block width."); + assert(log2_width != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mode; + intra_mode = luma_mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); - - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); + + // Transform wide angle mode to intra mode + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; @@ -933,44 +1105,45 @@ void uvg_inv_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { // In VTM, max log2 dynamic range is something in range [15, 20] depending on whether extended precision processing is enabled // Such is not yet present in uvg266 so use 15 for now const int max_log2_dyn_range = 15; const uint32_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA && tree_type != UVG_CHROMA_T ? cur_cu->intra.mip_flag : false; - bool is_wide_angle = false; // TODO: get wide angle mode when implemented - - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); + const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mip_flag ? 0 : cur_cu->intra.mode; + intra_mode = luma_mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } - + + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose_flag = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; bool tu_4x4_flag = (width == 4 && height == 4); @@ -1053,7 +1226,7 @@ void uvg_inv_lfnst( */ int uvg_quantize_residual_trskip( encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, int8_t *trskip_out, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -1074,7 +1247,7 @@ int uvg_quantize_residual_trskip( //noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; skip.has_coeffs = uvg_quantize_residual( - state, cur_cu, width, color, scan_order, + state, cur_cu, width, height, color, scan_order, 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj, UVG_BOTH_T /* tree type doesn't matter for transformskip*/); @@ -1090,13 +1263,15 @@ if (best->has_coeffs || rec_out != pred_in) { // If there is no residual and reconstruction is already in rec_out, // we can skip this. - uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride); + uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride); } - copy_coeffs(best->coeff, coeff_out, width); + // TODO: copying coeffs here is very suspect + copy_coeffs(best->coeff, coeff_out, width, height, width); return best->has_coeffs; } + /** * Calculate the residual coefficients for a single TU. * @@ -1105,14 +1280,15 @@ static void quantize_tr_residual( encoder_state_t * const state, const color_t color, - const int32_t x, - const int32_t y, - const uint8_t depth, + const cu_loc_t *cu_loc, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, enum uvg_tree_type tree_type) { + const int x = cu_loc->x; + const int y = cu_loc->y; + const uvg_config *cfg = &state->encoder_control->cfg; const int32_t shift = color == COLOR_Y ? 0 : 1; const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift}; @@ -1120,9 +1296,10 @@ static void quantize_tr_residual( // If luma is 4x4, do chroma for the 8x8 luma area when handling the top // left PU because the coordinates are correct. bool handled_elsewhere = color != COLOR_Y && - depth == MAX_DEPTH && + cur_pu->log2_width + cur_pu->log2_height < 6 && (x % 4 != 0 || y % 4 != 0); if (handled_elsewhere) { + assert(0); return; } @@ -1130,44 +1307,44 @@ // This should ensure that the CBF data doesn't get corrupted if this function // is called more than once. - int32_t tr_width; - if (color == COLOR_Y) { - tr_width = LCU_WIDTH >> depth; - } else { - const int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth); - tr_width = LCU_WIDTH_C >> chroma_depth; - } + const int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int32_t lcu_width = LCU_WIDTH >> shift; const int8_t mode = (color == COLOR_Y) ? cur_pu->intra.mode : cur_pu->intra.mode_chroma; - const coeff_scan_order_t scan_idx = - uvg_get_scan_order(cur_pu->type, mode, depth); + + const coeff_scan_order_t scan_idx = SCAN_DIAG; const int offset = lcu_px.x + lcu_px.y * lcu_width; - const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); + //const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); // Pointers to current location in arrays with prediction. The // reconstruction will be written to this array.
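A note on the layout change in this hunk: with xy_to_zorder retired, each TU's coefficients are staged in a local TR_MAX_WIDTH * TR_MAX_WIDTH buffer and then copied into the LCU-wide planes at raster offset lcu_px.x + lcu_px.y * lcu_width. A minimal self-contained sketch of that row-by-row copy follows; the helper name is hypothetical, and coeff_t is assumed to match uvg266's int16_t coefficient typedef:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef int16_t coeff_t; /* assumption: matches uvg266's coefficient type */

/* Copy a tr_width x tr_height TU into the LCU-wide raster-order plane, or
 * clear its footprint when the TU produced no coefficients. This restates
 * the memcpy/memset loops added further down in this hunk. */
static void stage_tu_coeffs(coeff_t *dst_coeff, const coeff_t *coeff,
                            int tr_width, int tr_height, int lcu_width,
                            bool has_coeffs)
{
  for (int j = 0; j < tr_height; ++j) {
    if (has_coeffs) {
      memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width],
             tr_width * sizeof(coeff_t));
    } else {
      memset(&dst_coeff[j * lcu_width], 0, tr_width * sizeof(coeff_t));
    }
  }
}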
uvg_pixel *pred = NULL; // Pointers to current location in arrays with reference. const uvg_pixel *ref = NULL; - // Pointers to current location in arrays with quantized coefficients. - coeff_t *coeff = NULL; + // Temp coeff array + coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t *dst_coeff = NULL; switch (color) { case COLOR_Y: - pred = &lcu->rec.y[offset]; - ref = &lcu->ref.y[offset]; - coeff = &lcu->coeff.y[z_index]; + pred = &lcu->rec.y[offset]; + ref = &lcu->ref.y[offset]; + dst_coeff = &lcu->coeff.y[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_U: - pred = &lcu->rec.u[offset]; - ref = &lcu->ref.u[offset]; - coeff = &lcu->coeff.u[z_index]; + pred = &lcu->rec.u[offset]; + ref = &lcu->ref.u[offset]; + dst_coeff = &lcu->coeff.u[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_V: - pred = &lcu->rec.v[offset]; - ref = &lcu->ref.v[offset]; - coeff = &lcu->coeff.v[z_index]; + pred = &lcu->rec.v[offset]; + ref = &lcu->ref.v[offset]; + dst_coeff = &lcu->coeff.v[lcu_px.x + lcu_px.y * lcu_width]; + break; + case COLOR_UV: + dst_coeff = &lcu->coeff.joint_uv[lcu_px.x + lcu_px.y * lcu_width]; break; default: break; @@ -1187,6 +1364,7 @@ static void quantize_tr_residual( if (cfg->lossless) { has_coeffs = bypass_transquant(tr_width, + tr_height, lcu_width, // in stride lcu_width, // out stride ref, @@ -1196,9 +1374,9 @@ static void quantize_tr_residual( if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) { // implicit rdpcm for horizontal and vertical intra modes if (mode == 18) { - rdpcm(tr_width, RDPCM_HOR, coeff); + rdpcm(tr_width, tr_height, RDPCM_HOR, coeff); } else if (mode == 50) { - rdpcm(tr_width, RDPCM_VER, coeff); + rdpcm(tr_width, tr_height, RDPCM_VER, coeff); } } @@ -1209,6 +1387,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual_trskip(state, cur_pu, tr_width, + tr_height, color, scan_idx, &tr_skip, @@ -1225,24 +1404,37 @@ static void quantize_tr_residual( state, cur_pu, tr_width, + tr_height, scan_idx, lcu_width, lcu_width, &lcu->ref.u[offset], &lcu->ref.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], - &lcu->coeff.joint_uv[z_index], + coeff, early_skip, lmcs_chroma_adj, tree_type ); cur_pu->joint_cb_cr = has_coeffs; + if (has_coeffs) { + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + } + cbf_set(&cur_pu->cbf, COLOR_U); + } + else { + for (int j = 0; j < tr_height; ++j) { + memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); + } + } return; } has_coeffs = uvg_quantize_residual(state, cur_pu, tr_width, + tr_height, color, scan_idx, false, // tr skip @@ -1258,11 +1450,18 @@ static void quantize_tr_residual( } - cbf_clear(&cur_pu->cbf, depth, color); + cbf_clear(&cur_pu->cbf, color); if (has_coeffs) { - cbf_set(&cur_pu->cbf, depth, color); + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + } + cbf_set(&cur_pu->cbf, color); + } + else { + for (int j = 0; j < tr_height; ++j) { + memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); + } } - } /** @@ -1287,15 +1486,17 @@ void uvg_quantize_lcu_residual( const bool luma, const bool chroma, const bool jccr, - const int32_t x, - const int32_t y, - const uint8_t depth, + const cu_loc_t * cu_loc, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, enum uvg_tree_type tree_type) { - const int32_t width = LCU_WIDTH >> depth; + const int x = cu_loc->x; + const int y = cu_loc->y; + const 
int width = cu_loc->width; + const int height = cu_loc->height; + const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; if (cur_pu == NULL) { @@ -1304,7 +1505,10 @@ void uvg_quantize_lcu_residual( // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. - assert(width == 4 || + // Widths 1 and 2 were thought to be possible with ISP blocks // ISP_TODO: they are not; drop them from the assert once verified + assert(width == 1 || + width == 2 || + width == 4 || width == 8 || width == 16 || width == 32 || @@ -1312,54 +1516,79 @@ // Reset CBFs because CBFs might have been set // for depth earlier + // ISP_TODO: does this cur_cu point to the correct place when ISP is used for small blocks? if (luma) { - cbf_clear(&cur_pu->cbf, depth, COLOR_Y); + cbf_clear(&cur_pu->cbf, COLOR_Y); } if (chroma || jccr) { - cbf_clear(&cur_pu->cbf, depth, COLOR_U); - cbf_clear(&cur_pu->cbf, depth, COLOR_V); + cbf_clear(&cur_pu->cbf, COLOR_U); + cbf_clear(&cur_pu->cbf, COLOR_V); } - if (depth == 0 || cur_pu->tr_depth > depth) { - - // Split transform and increase depth - const int offset = width / 2; - const int32_t x2 = x + offset; - const int32_t y2 = y + offset; - - // jccr is currently not supported if transform is split - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip, tree_type); - - // Propagate coded block flags from child CUs to parent CU. - uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, - }; - - if (depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + uint16_t child_cbfs[3]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc, NULL); + + for (int i = 0; i < split_count; ++i) { + uvg_quantize_lcu_residual(state, luma, chroma, 0, &split_cu_loc[i], NULL, lcu, early_skip, tree_type); + if(i != 0) { + child_cbfs[i - 1] = LCU_GET_CU_AT_PX(lcu, split_cu_loc[i].local_x, split_cu_loc[i].local_y)->cbf; + } + } + + + cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) + || cbf_is_set_any(child_cbfs[0]) + || cbf_is_set_any(child_cbfs[1]) + || cbf_is_set_any(child_cbfs[2]); + } else { // Process a leaf TU.
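Before the leaf-TU path continues, a minimal standalone sketch of the implicit transform-split rule introduced above: an oversized TU is quad-split when both dimensions exceed the maximum transform size, split vertically when only the width does, and horizontally when only the height does. The enum and function names below are hypothetical stand-ins for the patch's split_type values:

#include <assert.h>

enum tu_split_sketch { TU_QT_SPLIT, TU_BT_HOR_SPLIT, TU_BT_VER_SPLIT };

/* tr_max stands in for TR_MAX_WIDTH; at least one dimension must exceed it. */
static enum tu_split_sketch implicit_tu_split(int width, int height, int tr_max)
{
  assert(width > tr_max || height > tr_max);
  if (width > tr_max && height > tr_max) {
    return TU_QT_SPLIT;      /* both too large: four sub-TUs */
  } else if (width > tr_max) {
    return TU_BT_VER_SPLIT;  /* too wide: left/right halves */
  } else {
    return TU_BT_HOR_SPLIT;  /* too tall: top/bottom halves */
  }
}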
+ cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, width, height); + if (luma) { - quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } + double c_lambda = state->c_lambda; + state->c_lambda = uvg_calculate_chroma_lambda(state, state->encoder_control->cfg.jccr, cur_pu->joint_cb_cr); if (chroma) { - quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip, tree_type); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip, tree_type); + state->rate_estimator[2].needs_init = true; + if(state->encoder_control->cfg.dep_quant) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + cu_loc_t temp_chroma_loc; + uvg_cu_loc_ctor(&temp_chroma_loc, (cu_loc->x >> 1) % LCU_WIDTH_C, (cu_loc->y >> 1) % LCU_WIDTH_C, cu_loc->width, cu_loc->height); + uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &temp_chroma_loc, COLOR_U, 0, (cur_pu->tr_skip & 2) >> 1, COEFF_ORDER_CU); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } + else { + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + } } - if (jccr && cur_pu->tr_depth == cur_pu->depth) { - quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip, tree_type); + if (jccr && PU_IS_TU(cur_pu)) { + quantize_tr_residual(state, COLOR_UV, &loc, cur_pu, lcu, early_skip, tree_type); } - if(chroma && jccr && cur_pu->tr_depth == cur_pu->depth) { + if(chroma && jccr && PU_IS_TU(cur_pu)) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); } + state->c_lambda = c_lambda; } } diff --git a/src/transform.h b/src/transform.h index d3f44edf..be485f46 100644 --- a/src/transform.h +++ b/src/transform.h @@ -44,23 +44,28 @@ #include "global.h" // IWYU pragma: keep extern const uint8_t uvg_g_chroma_scale[58]; -extern const int16_t uvg_g_inv_quant_scales[6]; -extern const int16_t uvg_g_quant_scales[6]; +extern const int16_t uvg_g_inv_quant_scales[2][6]; +extern const int16_t uvg_g_quant_scales[2][6]; -void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); -void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); +#define COEFF_ORDER_LINEAR 0 +#define COEFF_ORDER_CU 1 + +void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); +void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); @@ -69,11 +74,12 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int 
width, - const int height); + const int height, + const vector2d_t * const lcu_px, + color_t color); typedef struct { double best_u_cost; @@ -82,6 +88,10 @@ typedef struct { int best_u_index; int best_v_index; int best_combined_index; + uint64_t u_distortion; + uint64_t v_distortion; + double u_bits; + double v_bits; } uvg_chorma_ts_out_t; void uvg_quantize_lcu_residual( @@ -89,9 +99,7 @@ void uvg_quantize_lcu_residual( bool luma, bool chroma, const bool jccr, - int32_t x, - int32_t y, - uint8_t depth, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t* lcu, bool early_skip, @@ -99,13 +107,10 @@ void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], @@ -130,7 +135,8 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); void uvg_inv_lfnst( const cu_info_t* cur_cu, @@ -139,6 +145,7 @@ const color_t color, const uint16_t lfnst_idx, coeff_t* coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); #endif diff --git a/src/uvg266.h b/src/uvg266.h index 3bec7756..c71a835a 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -338,7 +338,6 @@ typedef struct uvg_config int32_t trskip_max_size; /*!< \brief Transform skip max block size. */ enum uvg_mts mts; /*< \brief flag to enable multiple transform selection*/ int32_t mts_implicit; /*< \brief flag to enable implicit multiple transform selection*/ - int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */ enum uvg_ime_algorithm ime_algorithm; /*!< \brief Integer motion estimation algorithm. */ int32_t fme_level; /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled).
*/ int8_t source_scan_type; /*!< \brief Source scan type (0: progressive, 1: top field first, 2: bottom field first).*/ @@ -452,7 +451,7 @@ typedef struct uvg_config /** \brief Flag to enable/disable open GOP configuration */ int8_t open_gop; - int32_t vaq; /** \brief Enable variance adaptive quantization*/ + int32_t vaq; /** \brief Enable variance adaptive quantization*/ /** \brief Type of scaling lists to use */ int8_t scaling_list; @@ -526,6 +525,8 @@ typedef struct uvg_config /** \brief enable low frequency non-separable transform */ int8_t lfnst; + /** \brief enable intra sub partitions*/ + int8_t isp; int8_t jccr; @@ -542,9 +543,16 @@ typedef struct uvg_config uint8_t dual_tree; + uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_bt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_tt_size[3]; /* intra, inter, dual tree chroma*/ + + uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma*/ + uint8_t intra_rough_search_levels; uint8_t ibc; /* \brief Intra Block Copy parameter */ + uint8_t dep_quant; } uvg_config; /** diff --git a/src/videoframe.c b/src/videoframe.c index f5a4d8af..e9a43dc1 100644 --- a/src/videoframe.c +++ b/src/videoframe.c @@ -61,7 +61,7 @@ videoframe_t * uvg_videoframe_alloc(int32_t width, frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu); if (cclm) { assert(chroma_format == UVG_CSP_420); - frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4); + frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 15) & ~7) + FRAME_PADDING_LUMA) / 4); frame->cclm_luma_rec_top_line = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64)); } } diff --git a/tests/check_cabac_state_consistency.py b/tests/check_cabac_state_consistency.py index 4d7f970c..73a1dd72 100644 --- a/tests/check_cabac_state_consistency.py +++ b/tests/check_cabac_state_consistency.py @@ -30,7 +30,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int with open(state_file, "rb") as file: try: while True: - type_, x, y, depth, tree_type = file.read(15).decode().split() + type_, x, y, depth, tree_type = file.read(23).decode().split() # Reset stored data at the beginning of the frame if x == '0' and y == '0' and type_ == "S" and tree_type != "2": if not was_zero_last: @@ -38,7 +38,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int ctx_store = dict() e_store = set() was_zero_last = True - else: + elif int(x) >= 64 and int(y) >= 64: was_zero_last = False ctx = file.read(ctx_count * ctx_size) diff --git a/tests/mts_tests.c b/tests/mts_tests.c index f607b77d..61f9fb2c 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -111,7 +111,8 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); + tu.intra.isp_mode = 0; + mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); } } } @@ -134,7 +135,8 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], 
idct_result[trafo][block], UVG_MTS_BOTH); + tu.intra.isp_mode = 0; + idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH); } } @@ -156,6 +158,7 @@ TEST dct(void) { char testname[100]; for (int blocksize = 0; blocksize < NUM_SIZES; blocksize++) { + size_t size = 1 << (LCU_MIN_LOG_W + blocksize); for (int trafo = 0; trafo < NUM_TRANSFORM; trafo++) { sprintf(testname, "Block: %d x %d, trafo: %d", 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), trafo); cu_info_t tu; @@ -163,14 +166,20 @@ TEST dct(void) tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); - for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { - ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + for (int y = 0; y < size; ++y) { + if (y>= 16) break; + for (int x = 0; x < size; ++x) { + if (x >= 16) break; + int i = y * size + x; + ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + } } //fprintf(stderr, "PASS: %s\r\n", testname); } @@ -188,11 +197,14 @@ TEST idct(void) cu_info_t tu; tu.type = CU_INTRA; tu.tr_idx = MTS_DST7_DST7 + trafo; + tu.lfnst_idx = 0; + tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]); diff --git a/tests/mv_cand_tests.c b/tests/mv_cand_tests.c index 84ab9328..849fec2d 100644 --- a/tests/mv_cand_tests.c +++ b/tests/mv_cand_tests.c @@ -46,8 +46,11 @@ TEST test_get_spatial_merge_cand(void) merge_candidates_t cand = { 0 }; - get_spatial_merge_candidates(64 + 32, 64, // x, y - 32, 24, // width, height + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y + 32, 24); // width, height) + + get_spatial_merge_candidates(&cu_loc, 1920, 1080, // picture size &lcu, &cand, diff --git a/tests/test_cabac_state.sh b/tests/test_cabac_state.sh index 519f9c40..6d60d1da 100755 --- a/tests/test_cabac_state.sh +++ b/tests/test_cabac_state.sh @@ -6,10 +6,10 @@ set -eu cabacfile="$(mktemp)" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid 
--preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" rm -rf "${cabacfile}" diff --git a/tests/test_intra.sh b/tests/test_intra.sh index 3c37f82b..ea58b415 100755 --- a/tests/test_intra.sh +++ b/tests/test_intra.sh @@ -19,3 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra valgrind_test $common_args --rd=3 --cclm --jccr valgrind_test $common_args --lfnst valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra --fast-residual-cost 0 diff --git a/tests/test_mtt.sh b/tests/test_mtt.sh new file mode 100755 index 00000000..5fc5587b --- /dev/null +++ b/tests/test_mtt.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +# Test all-intra coding with MTT (multi-type tree) splits. + +set -eu + +. "${0%/*}/util.sh" + +common_args='264x130 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-cpuid --no-wpp --fast-residual-cost 0' +valgrind_test $common_args --rd=0 --mtt-depth-intra 1 --pu-depth-intra 2-3 +valgrind_test $common_args --rd=3 --mtt-depth-intra 1 --pu-depth-intra 0-5 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --rdoq --jccr --isp --lfnst --mip --mrl --mts intra --cclm --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 diff --git a/tools/generate_tables.c b/tools/generate_tables.c index d50c889f..6bd2497e 100644 --- a/tools/generate_tables.c +++ b/tools/generate_tables.c @@ -51,7 +51,7 @@ static void init_sig_last_scan(uint32_t *buff_d, uint32_t *buff_h, uint32_t *buff_v, int32_t width, int32_t height) { - uint32_t num_scan_pos = width * width; + uint32_t num_scan_pos = width * height; uint32_t next_scan_pos = 0; int32_t xx, yy, x, y; uint32_t scan_line;
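On the final hunk: init_sig_last_scan previously sized the scan with width * width, which held only for the square blocks of a QT-only tree; with the rectangular blocks MTT introduces, the position count must be width * height. A minimal sketch of enumerating a rectangular block in up-right diagonal order under that corrected count (diag_scan_rect is a hypothetical name, not the tool's actual routine):

#include <stdint.h>

/* Enumerate a width x height block along up-right anti-diagonals, writing
 * raster indices into out. Fills exactly width * height entries, e.g. 32
 * for an 8x4 block rather than the 64 implied by width * width. */
static void diag_scan_rect(uint32_t *out, int32_t width, int32_t height)
{
  uint32_t n = 0;
  for (int32_t d = 0; d < width + height - 1; ++d) {
    /* start from the bottom-left end of each anti-diagonal */
    for (int32_t y = d < height ? d : height - 1; y >= 0 && d - y < width; --y) {
      out[n++] = (uint32_t)(y * width + (d - y));
    }
  }
  /* n == width * height at this point */
}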