Merge branch 'release-prep' into master

2024-11-23 18:14:06 +00:00 · 2023-09-27 08:11:09 +03:00 · 2023-09-27 08:11:09 +03:00 · 84580aebb0
parent 1a1fea1a19 4a1cd926fb
commit 84580aebb0
79 changed files with 23515 additions and 3642 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c")
 list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h")

 # Add also all the strategies
-file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c")
+file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c")

 # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC
 list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c")
@ -340,6 +340,9 @@ if(NOT DEFINED MSVC)
  if(NOT "test_external_symbols" IN_LIST XFAIL)
    add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
  endif()
+  if(NOT "test_mtt" IN_LIST XFAIL)
+    add_test( NAME test_mtt COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mtt.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
+  endif()
  if(NOT "test_intra" IN_LIST XFAIL)
    add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
  endif()
--- a/src/cabac.h
+++ b/src/cabac.h
@ -77,6 +77,8 @@ typedef struct
    cabac_ctx_t mts_idx_model[4];
    cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models
    cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models
+    cabac_ctx_t mtt_vertical_model[5]; 
+    cabac_ctx_t mtt_binary_model[4]; 
    cabac_ctx_t intra_luma_mpm_flag_model;    //!< \brief intra mode context models
    cabac_ctx_t intra_subpart_model[2];    //!< \brief intra sub part context models
    cabac_ctx_t chroma_pred_model;
--- a/src/cfg.c
+++ b/src/cfg.c
@ -80,7 +80,6 @@ int uvg_config_init(uvg_config *cfg)
  cfg->trskip_max_size = 2; //Default to 4x4
  cfg->mts             = 0;
  cfg->mts_implicit    = 0;
-  cfg->tr_depth_intra  = 0;
  cfg->ime_algorithm   = 0; /* hexbs */
  cfg->fme_level       = 4;
  cfg->source_scan_type = 0; /* progressive */
@ -207,6 +206,8 @@ int uvg_config_init(uvg_config *cfg)

  cfg->lfnst = false;

+  cfg->isp = false;
+
  parse_qp_map(cfg, 0);

  cfg->jccr = 0;
@ -221,10 +222,27 @@ int uvg_config_init(uvg_config *cfg)
  cfg->cabac_debug_file_name = NULL;

  cfg->dual_tree = 0;
+
+  cfg->min_qt_size[0] = 4;
+  cfg->min_qt_size[1] = 4;
+  cfg->min_qt_size[2] = 4;
+
+  cfg->max_btt_depth[0] = 0;
+  cfg->max_btt_depth[1] = 0;
+  cfg->max_btt_depth[2] = 0;
+
+  cfg->max_tt_size[0] = 64;
+  cfg->max_bt_size[0] = 64;
+  cfg->max_tt_size[1] = 64;
+  cfg->max_bt_size[1] = 64;
+  cfg->max_tt_size[2] = 64;
+  cfg->max_bt_size[2] = 64;
+
  cfg->intra_rough_search_levels = 2;

  cfg->ibc = 0;

+  cfg->dep_quant = 0;
  return 1;
 }

@ -333,7 +351,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil

  return 1;
 }
-/*
+
 static int parse_uint8(const char *numstr,uint8_t* number,int min, int max)
 {
  char *tail;
@ -349,7 +367,7 @@ static int parse_uint8(const char *numstr,uint8_t* number,int min, int max)
    return 1;
  }
 }
-*/
+
 static int parse_int8(const char *numstr,int8_t* number,int min, int max)
 {
  char *tail;
@ -365,7 +383,7 @@ static int parse_int8(const char *numstr,int8_t* number,int min, int max)
    return 1;
  }
 }
-/*
+
 static int parse_array(const char *array, uint8_t *coeff_key, int size,
                            int min, int max)
 {
@ -389,15 +407,15 @@ static int parse_array(const char *array, uint8_t *coeff_key, int size,
    free(key);
    return 0;
  }
-  else if (i<size){
-    fprintf(stderr, "parsing failed : too few members.\n");
+  //else if (i<size){
+  //  fprintf(stderr, "parsing failed : too few members.\n");
+  //  free(key);
+  //  return 0;
+  //}
  free(key);
-    return 0;
+  return i;
 }
-  free(key);
-  return 1;
-}
-*/
+

 static int parse_qp_scale_array(const char *array, int8_t *out)
 {
@ -928,8 +946,6 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
    cfg->mts = mts_type;
    cfg->mts_implicit = (mts_type == UVG_MTS_IMPLICIT);
  }
-  else if OPT("tr-depth-intra")
-    cfg->tr_depth_intra = atoi(value);
  else if OPT("me") {
    int8_t ime_algorithm = 0;
    if (!parse_enum(value, me_names, &ime_algorithm)) return 0;
@ -1454,6 +1470,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
  else if OPT("lfnst") {
    cfg->lfnst = atobool(value);
  }
+  else if OPT("isp") {
+    cfg->isp = atobool(value);
+  }
  else if OPT("jccr") {
    cfg->jccr = (bool)atobool(value);
  }
@ -1479,6 +1498,49 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
  else if OPT("dual-tree") {
    cfg->dual_tree = atobool(value);
  }
+  else if OPT("mtt-depth-intra") {
+    cfg->max_btt_depth[0]  = atoi(value);
+  }
+  else if OPT("mtt-depth-intra-chroma") {
+    cfg->max_btt_depth[2]  = atoi(value);
+  }
+  else if OPT("mtt-depth-inter") {
+    cfg->max_btt_depth[1]  = atoi(value);
+  }
+  else if OPT("max-bt-size") {
+  uint8_t sizes[3];
+  const int got = parse_array(value, sizes, 3, 0, 128);
+    if (got == 1) {
+      cfg->max_bt_size[0] = sizes[0];
+      cfg->max_bt_size[1] = sizes[0];
+      cfg->max_bt_size[2] = sizes[0];
+    }
+    else if (got == 3) {
+      cfg->max_bt_size[0] = sizes[0];
+      cfg->max_bt_size[1] = sizes[1];
+      cfg->max_bt_size[2] = sizes[2];      
+    } else {
+      fprintf(stderr, "Incorrect amount of values provided for max-bt-size\n");
+      return 0;
+    }
+  }
+  else if OPT("max-tt-size") {
+  uint8_t sizes[3];
+  const int got = parse_array(value, sizes, 3, 0, 128);
+    if (got == 1) {
+      cfg->max_tt_size[0] = sizes[0];
+      cfg->max_tt_size[1] = sizes[0];
+      cfg->max_tt_size[2] = sizes[0];
+    }
+    else if (got == 3) {
+      cfg->max_tt_size[0] = sizes[0];
+      cfg->max_tt_size[1] = sizes[1];
+      cfg->max_tt_size[2] = sizes[2];      
+    } else {
+      fprintf(stderr, "Incorrect amount of values provided for max-tt-size\n");
+      return 0;
+    }
+  }
  else if OPT("intra-rough-granularity") {
    cfg->intra_rough_search_levels = atoi(value);
  }
@ -1489,7 +1551,11 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
      return 0;
    }
    cfg->ibc = (uint8_t)ibc_value;
-  }  else {
+  }
+  else if OPT("dep-quant") {
+    cfg->dep_quant = (bool)atobool(value);
+  }
+  else {
    return 0;
  }
 #undef OPT
@ -1681,12 +1747,6 @@ int uvg_config_validate(const uvg_config *const cfg)
    error = 1;
  }

-  if (cfg->tr_depth_intra < 0 || cfg->tr_depth_intra > 4) {
-    // range is 0 .. CtbLog2SizeY - Log2MinTrafoSize
-    fprintf(stderr, "Input error: --tr-depth-intra is out of range [0..4]\n");
-    error = 1;
-  }
-
  if (cfg->fme_level != 0 && cfg->fme_level > 4) {
    fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n");
    error = 1;
--- a/src/cli.c
+++ b/src/cli.c
@ -76,7 +76,6 @@ static const struct option long_options[] = {
  { "tr-skip-max-size",   required_argument, NULL, 0 },
  { "mts",                required_argument, NULL, 0 },
  { "no-mts",                   no_argument, NULL, 0 },
-  { "tr-depth-intra",     required_argument, NULL, 0 },
  { "me",                 required_argument, NULL, 0 },
  { "subme",              required_argument, NULL, 0 },
  { "source-scan-type",   required_argument, NULL, 0 },
@ -178,6 +177,8 @@ static const struct option long_options[] = {
  { "no-mip",                   no_argument, NULL, 0 },
  { "lfnst",                    no_argument, NULL, 0 },
  { "no-lfnst",                 no_argument, NULL, 0 },
+  { "isp",                      no_argument, NULL, 0 },
+  { "no-isp",                   no_argument, NULL, 0 },
  { "jccr",                     no_argument, NULL, 0 },
  { "no-jccr",                  no_argument, NULL, 0 },
  { "amvr",                     no_argument, NULL, 0 },
@ -191,8 +192,15 @@ static const struct option long_options[] = {
  { "dual-tree",                no_argument, NULL, 0 },
  { "no-dual-tree",             no_argument, NULL, 0 },
  { "cabac-debug-file",   required_argument, NULL, 0 },
+  { "mtt-depth-intra",    required_argument, NULL, 0 },
+  { "mtt-depth-inter",    required_argument, NULL, 0 },
+  { "mtt-depth-intra-chroma", required_argument, NULL, 0 },
+  { "max-bt-size",        required_argument, NULL, 0 },
+  { "max-tt-size",        required_argument, NULL, 0 },
  { "intra-rough-granularity",required_argument, NULL, 0 },
  { "ibc",                required_argument, NULL, 0 },
+  { "dep-quant",                no_argument, NULL, 0 },
+  { "no-dep-quant",             no_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -571,6 +579,7 @@ void print_help(void)
    "                                   - full: Full ALF\n"
    "      --(no-)rdoq            : Rate-distortion optimized quantization [enabled]\n"
    "      --(no-)rdoq-skip       : Skip RDOQ for 4x4 blocks. [disabled]\n"
+    "      --(no-)dep-quant       : Use dependent quantization. [disabled]\n"
    "      --(no-)signhide        : Sign hiding [disabled]\n"
    "      --rd <integer>         : Intra mode search complexity [0]\n"
    "                                   - 0: Skip intra if inter is good enough.\n"
@ -602,14 +611,14 @@ void print_help(void)
    "                                   - 2: + 1/2-pixel diagonal\n"
    "                                   - 3: + 1/4-pixel horizontal and vertical\n"
    "                                   - 4: + 1/4-pixel diagonal\n"
-    "      --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n"
-    "                                   - 0, 1, 2, 3: from 64x64 to 8x8\n"
+    "      --pu-depth-inter <int>-<int> : Maximum and minimum split depths where\n"
+    "                                     inter search is performed 0..8. [0-3]\n"
    "                                   - Accepts a list of values separated by ','\n"
    "                                     for setting separate depths per GOP layer\n"
    "                                     (values can be omitted to use the first\n"
    "                                     value for the respective layer).\n"
-    "      --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n"
-    "                                   - 0, 1, 2, 3, 4: from 64x64 to 4x4\n"
+    "      --pu-depth-intra <int>-<int> : Maximum and minimum split depths where\n"
+    "                                     intra search is performed 0..8. [1-4]\n"
    "                                   - Accepts a list of values separated by ','\n"
    "                                     for setting separate depths per GOP layer\n"
    "                                     (values can be omitted to use the first\n"
@ -617,6 +626,22 @@ void print_help(void)
    "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
    "                                learning trees, overrides the\n"
    "                                --pu-depth-intra parameter. [disabled]\n"
+    "      --mtt-depth-intra      : Depth of mtt for intra slices 0..3.[0]\n"
+    "      --mtt-depth-intra-chroma : Depth of mtt for chroma dual tree in\n"
+    "                                      intra slices 0..3.[0]\n"
+    "      --mtt-depth-inter      : Depth of mtt for inter slices 0..3.[0]\n"
+    "                              All MTTs are currently experimental and\n"
+    "                              require disabling some avx2 optimizations.\n"
+    "      --max-bt-size          : maximum size for a CU resulting from\n"
+    "                                   a bt split. A singular value shared for all\n"
+    "                                   or a list of three values for the different\n"
+    "                                   slices types (intra, inter, intra-chroma)\n"
+    "                                   can be provided. [64, 64, 32]\n"
+    "      --max-tt-size          : maximum size for a CU resulting from\n"
+    "                                   a tt split. A singular value shared for all\n"
+    "                                   or a list of three values for the different\n"
+    "                                   slices types (intra, inter, intra-chroma)\n"
+    "                                   can be provided. [64, 64, 32]\n"
    "      --intra-rough-granularity : How many levels are used for the\n"
    "                                   logarithmic intra rough search. 0..4\n"
    "                                   With 0 all of the modes are checked \n"
@ -634,7 +659,6 @@ void print_help(void)
    "                               This is mostly for debugging and is not\n"
    "                               guaranteed to produce sensible bitstream or\n"
    "                               work at all. [disabled]\n"
-    "      --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
    "      --(no-)bipred          : Bi-prediction [disabled]\n"
    "      --cu-split-termination <string> : CU split search termination [zero]\n"
    "                                   - off: Don't terminate early.\n"
@ -671,6 +695,9 @@ void print_help(void)
    "      --(no-)mip             : Enable matrix weighted intra prediction.\n"
    "      --(no-)lfnst           : Enable low frequency non-separable transform.\n"
    "                                 [disabled]\n"
+    "      --(no-)isp             : Enable intra sub partitions. [disabled]\n"
+    "                               Experimental, requires disabling some avx2\n"
+    "                               optimizations.\n"
    "      --mts <string>         : Multiple Transform Selection [off].\n"
    "                               (Currently only implemented for intra\n"
    "                               and has effect only when rd >= 2)\n"
--- a/src/context.c
+++ b/src/context.c
@ -50,6 +50,21 @@ static const uint8_t  INIT_QT_SPLIT_FLAG[4][6] = {
  {   0,   8,   8,  12,  12,   8, },
 };

+
+// Initial values for the mtt_vertical_model CABAC contexts.
+// uvg_init_contexts indexes the first dimension with its slice argument and
+// passes the matching entry of row 3 as the last argument of uvg_ctx_init.
+static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = {
+  {  43,  42,  37,  42,  44, },
+  {  43,  35,  37,  34,  52, },
+  {  43,  42,  29,  27,  44, },
+  {   9,   8,   9,   8,   5, },
+};
+
+// Initial values for the mtt_binary_model CABAC contexts; same row layout
+// as INIT_VERTICAL_SPLIT_FLAG (rows 0..2 picked by slice, row 3 passed along).
+static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = {
+  {  28,  29,  28,  29, },
+  {  43,  37,  21,  22, },
+  {  36,  45,  36,  45, },
+  {  12,  13,  12,  13, },
+  };
+
 static const uint8_t INIT_SKIP_FLAG[4][3] = {
  {  57,  60,  46, },
  {  57,  59,  45, },
@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
    uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]);
    uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]);
    uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]);
+    uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]);
+  }
+
+  for (i = 0; i < 5; i++) {
+    uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]);
  }

  for (i = 0; i < 6; i++) {  
@ -618,13 +638,14 @@ void uvg_context_copy(encoder_state_t * const target_state, const encoder_state_
 uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,
                                      uint32_t pos_x,
                                      uint32_t pos_y,
-                                      int32_t width)
+                                      int32_t width,
+                                      int32_t height)
 {
  uint32_t uiRight = 0;
  uint32_t uiLower = 0;
  uint32_t position = pos_y * width + pos_x;
  if (pos_x + 1 < (uint32_t)width) uiRight = sig_coeff_group_flag[position + 1];
-  if (pos_y + 1 < (uint32_t)width) uiLower = sig_coeff_group_flag[position + width];
+  if (pos_y + 1 < (uint32_t)height) uiLower = sig_coeff_group_flag[position + width];

  return uiRight || uiLower;
 }
@ -656,7 +677,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag,
 * \returns context index for current scan position
 */
 uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                                         uint32_t height, uint32_t width, int8_t type,
+                                         uint32_t width, uint32_t height, int8_t color,
                                         int32_t* temp_diag, int32_t* temp_sum)
 {
  const coeff_t* data = coeff + pos_x + pos_y * width;
@ -686,7 +707,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u
  }
 #undef UPDATE
  int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0);
-  if (type == 0 /* Luma */)
+  if (color == COLOR_Y)
  {
    ctx_ofs += diag < 5 ? 4 : 0;
  }
@ -814,7 +835,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos
 * \returns context go rice parameter
 */
 uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel)
+                             uint32_t width, uint32_t height, uint32_t baselevel)
 {
 #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/

@ -856,8 +877,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
 * \returns context go rice parameter
 */
 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-  uint32_t height, uint32_t width, uint32_t baselevel)
+  uint32_t width, uint32_t height, uint32_t baselevel)
 {
-  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel);
+  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel);
  return  g_go_rice_pars[check];  
 }
--- a/src/context.h
+++ b/src/context.h
@ -49,10 +49,10 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice);

 void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * source_state);

-uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width);
+uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width, int32_t height);
 uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width);
 uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                                         uint32_t height, uint32_t width, int8_t type, 
+                                         uint32_t width, uint32_t height, int8_t type, 
                                         int32_t* temp_diag, int32_t* temp_sum);

 uint32_t uvg_context_get_sig_ctx_idx_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos_y,
@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
                     uint32_t height, uint32_t width, uint32_t baselevel);

 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel);
+                             uint32_t width, uint32_t height, uint32_t baselevel);

 #define CNU 35
 #define DWS 8
--- a/src/cu.c
+++ b/src/cu.c
@ -34,6 +34,9 @@
 #include <stdlib.h>

 #include "cu.h"
+
+#include "alf.h"
+#include "encoderstate.h"
 #include "threads.h"


@ -97,6 +100,42 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px)
 }


+/*
+ * \brief Remap coordinates that are not multiples of four, in place.
+ *
+ * When both coordinates are divisible by 4 nothing is changed. Otherwise the
+ * misaligned coordinate is pulled back onto the 4-sample grid and the other
+ * coordinate is advanced by a multiple of 4.
+ * NOTE(review): presumably this gives every ISP sub-block its own 4x4 cell
+ * in the CU array -- confirm against the callers.
+ *
+ * \param x    In/out x coordinate.
+ * \param y    In/out y coordinate.
+ * \param dim  Compared against 8 to select the two-sample-wide (8x2 / 2x8) case.
+ */
+void uvg_get_isp_cu_arr_coords(int *x, int *y, int dim)
+{
+  const int x_rem = *x % 4;
+  const int y_rem = *y % 4;
+
+  // Already on the 4x4 grid, nothing to remap.
+  if (x_rem == 0 && y_rem == 0) return;
+
+  const int two_wide = (dim == 8);
+
+  if (y_rem == 0) {
+    // Vertical ISP split: only x is misaligned.
+    if (two_wide && *x % 2 == 0) {
+      // 2x8 block
+      *x -= 2;
+      *y += 4;
+    } else {
+      // 1x16 block
+      *x -= x_rem;
+      *y += x_rem * 4;
+    }
+    return;
+  }
+
+  // Horizontal ISP split: y is misaligned.
+  if (two_wide && y_rem % 2 == 0) {
+    // 8x2 block
+    *y -= 2;
+    *x += 4;
+  } else {
+    // 16x1 block
+    *y -= y_rem;
+    *x += y_rem * 4;
+  }
+}
+
+
 const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px)
 {
  assert(x_px < cua->width);
@ -237,10 +276,10 @@ cu_array_t * uvg_cu_array_copy_ref(cu_array_t* cua)
 * \param dst_y   y-coordinate of the top edge of the copied area in dst
 * \param src     source lcu
 */
-void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type)
+void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src)
 {
  const int dst_stride = dst->stride >> 2;
-  const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C;
+  const int width = LCU_WIDTH;
  for (int y = 0; y < width; y += SCU_WIDTH) {
    for (int x = 0; x < width; x += SCU_WIDTH) {
      const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y);
@ -251,3 +290,215 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
    }
  }
 }
+
+/*
+ * \brief Fill a cu_loc_t from a block position and size.
+ *
+ * Stores the absolute position, the position modulo LCU_WIDTH, the luma
+ * size and the halved chroma size.
+ *
+ * \param loc     Destination cu_loc.
+ * \param x       Top left x coordinate of the block.
+ * \param y       Top left y coordinate of the block.
+ * \param width   Block width.
+ * \param height  Block height.
+ */
+void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height)
+{
+  assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions.");
+  assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH).");
+  // No lower bound is asserted on purpose: with non-square blocks and ISP
+  // enabled, even 1x16 and 16x1 blocks are valid (ISP needs at least 16 samples).
+
+  // Horizontal position: absolute and relative to the LCU.
+  loc->x = x;
+  loc->local_x = x % LCU_WIDTH;
+
+  // Vertical position: absolute and relative to the LCU.
+  loc->y = y;
+  loc->local_y = y % LCU_WIDTH;
+
+  // Luma size.
+  loc->width = width;
+  loc->height = height;
+
+  // Chroma size is half of the luma size in both directions.
+  // TODO: when MTT is implemented, chroma dimensions can be minimum 2.
+  loc->chroma_width = width >> 1;
+  loc->chroma_height = height >> 1;
+}
+
+
+/*
+ * \brief Compute the child block locations produced by splitting a block.
+ *
+ * \param origin           Block being split.
+ * \param split            Split type to apply; NO_SPLIT trips an assert.
+ * \param out              Receives the child locations, filled from index 0.
+ * \param separate_chroma  Optional, may be NULL. Set to 1 when the origin
+ *                         width is 4 or when the per-split size checks below
+ *                         hit; it is never cleared by this function.
+ * \return Number of entries written to out (4 for QT, 2 for BT, 3 for TT),
+ *         or 0 if no split was produced.
+ */
+int uvg_get_split_locs(
+  const cu_loc_t* const origin,
+  enum split_type split,
+  cu_loc_t out[4],
+  uint8_t* separate_chroma)
+{
+  const int half_width = origin->width >> 1;
+  const int half_height = origin->height >> 1;
+  const int quarter_width = origin->width >> 2;
+  const int quarter_height = origin->height >> 2;
+  // Checked for every split type, before looking at the split itself.
+  if (origin->width == 4 && separate_chroma) *separate_chroma = 1;
+
+  switch (split) {
+    case NO_SPLIT:
+      assert(0 && "trying to get split from no split");
+    break;
+    case QT_SPLIT:
+      // Four quadrants: top-left, top-right, bottom-left, bottom-right.
+      uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, half_height);
+      uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height);
+      uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height);
+      uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height);
+      if (half_height == 4 && separate_chroma) *separate_chroma = 1;
+      return 4;
+    case BT_HOR_SPLIT:
+      // Two halves stacked vertically: top, bottom.
+      uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height);
+      uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height);
+      if (half_height * origin->width < 64 && separate_chroma) *separate_chroma = 1;
+      return 2;
+    case BT_VER_SPLIT:
+      // Two halves side by side: left, right.
+      uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height);
+      uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height);
+      if ((half_width == 4 || half_width * origin->height < 64) && separate_chroma) *separate_chroma = 1;
+      return 2;
+    case TT_HOR_SPLIT:
+      // Three parts stacked vertically with heights quarter, half, quarter.
+      uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height);
+      uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height);
+      uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height);
+      if (quarter_height * origin->width < 64 && separate_chroma) *separate_chroma = 1;
+      return 3;
+    case TT_VER_SPLIT:
+      // Three parts side by side with widths quarter, half, quarter.
+      uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height);
+      uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height);
+      uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height);
+      if ((quarter_width == 4 || quarter_width * origin->height < 64) && separate_chroma) *separate_chroma = 1;
+      return 3;
+  }
+  // Reached for NO_SPLIT when asserts are disabled, or for an unknown value.
+  return 0;
+}
+
+
+/*
+ * \brief Pick the split forced on a block that does not fit inside the frame.
+ *
+ * \param state          Encoder state; the frame size is read from its tile.
+ * \param cu_loc         Location and size of the block.
+ * \param max_mtt_depth  When zero, binary splits are never chosen.
+ * \return NO_SPLIT if the block fits in both directions, BT_HOR_SPLIT or
+ *         BT_VER_SPLIT if it fits in exactly one direction and
+ *         max_mtt_depth is non-zero, QT_SPLIT otherwise.
+ */
+int uvg_get_implicit_split(
+  const encoder_state_t* const state,
+  const cu_loc_t* const cu_loc,
+  uint8_t max_mtt_depth)
+{
+  const bool fits_in_width  = cu_loc->x + cu_loc->width <= (state->tile->frame->width);
+  const bool fits_in_height = cu_loc->y + cu_loc->height <= (state->tile->frame->height);
+
+  if (fits_in_width && fits_in_height) {
+    return NO_SPLIT;
+  }
+
+  // Binary splits are only an option when MTT is allowed at all.
+  if (max_mtt_depth != 0) {
+    // Only the bottom part sticks out of the frame.
+    if (fits_in_width) return BT_HOR_SPLIT;
+    // Only the right part sticks out of the frame.
+    if (fits_in_height) return BT_VER_SPLIT;
+  }
+
+  return QT_SPLIT;
+}
+
+
+/*
+ * \brief Determine which split types are allowed for a block.
+ *
+ * \param state       Encoder state; supplies the frame type, the frame size and
+ *                    the configured max_btt_depth, max_bt_size, max_tt_size
+ *                    and min_qt_size.
+ * \param cu_loc      Location and size of the block.
+ * \param split_tree  Split history of the block (depths, previous split, part index).
+ * \param tree_type   UVG_CHROMA_T enables the additional chroma restrictions.
+ * \param splits      Output, indexed by enum split_type; true if the split is allowed.
+ * \return 1 if the block is subject to an implicit split, 0 otherwise.
+ */
+int uvg_get_possible_splits(const encoder_state_t * const state,
+                            const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6])
+{
+  const unsigned width = cu_loc->width;
+  const unsigned height = cu_loc->height;
+  // Index into the per-slice config arrays: 0 for IRAP frames, 2 for the
+  // chroma tree of IRAP frames, 1 for all other frames.
+  const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1;
+
+  const unsigned max_btd =
+    state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth;
+  const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type];
+  const unsigned min_bt_size = 1 << MIN_SIZE;
+  const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type];
+  const unsigned min_tt_size = 1 << MIN_SIZE;
+  const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type];
+
+  const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd);
+  
+  // Start with everything allowed and disable splits one rule at a time.
+  splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true;
+  bool can_btt = split_tree.mtt_depth < max_btd;
+  
+  const enum split_type last_split = GET_SPLITDATA(&split_tree, split_tree.current_depth - 1);
+  // BT split running in the same direction as a preceding TT split.
+  const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT;
+
+  // don't allow QT-splitting below a BT split
+  if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false;
+  if (width <= min_qt_size)                              splits[QT_SPLIT] = false;
+
+  if (tree_type == UVG_CHROMA_T && width <= 8) splits[QT_SPLIT] = false;
+
+  // Implicit split: only the implied BT direction (size permitting) or QT
+  // remains, and QT is re-enabled if nothing else is left.
+  if (implicitSplit != NO_SPLIT)
+  {
+    splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false;
+
+    splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size;
+    splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size;
+    if (tree_type == UVG_CHROMA_T && width <= 8) splits[BT_VER_SPLIT] = false;
+    if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true;
+    return 1;
+  }
+
+  // Part 1 of a TT split may not be BT split in the same direction as the TT.
+  if ((last_split == TT_HOR_SPLIT || last_split == TT_VER_SPLIT) && split_tree.part_index == 1)
+  {
+    splits[BT_HOR_SPLIT] = parl_split != BT_HOR_SPLIT;
+    splits[BT_VER_SPLIT] = parl_split != BT_VER_SPLIT;
+  }
+
+  // Block is at or below the minimum size for both BT and TT.
+  if (can_btt && (width <= min_bt_size && height <= min_bt_size)
+    && ((width <= min_tt_size && height <= min_tt_size)))
+  {
+    can_btt = false;
+  }
+  // Block exceeds the maximum size for both BT and TT.
+  if (can_btt && (width > max_bt_size || height > max_bt_size)
+    && ((width > max_tt_size || height > max_tt_size)))
+  {
+    can_btt = false;
+  }
+
+  if (!can_btt)
+  {
+    splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false;
+
+    return 0;
+  }
+
+  if (width > max_bt_size || height > max_bt_size)
+  {
+    splits[BT_HOR_SPLIT] = splits[BT_VER_SPLIT] = false;
+  }
+
+  // specific check for BT splits
+  if (height <= min_bt_size)                            splits[BT_HOR_SPLIT] = false;
+  if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false;
+  if (tree_type == UVG_CHROMA_T && width * height <= 64)     splits[BT_HOR_SPLIT] = false;
+
+  if (width <= min_bt_size)                              splits[BT_VER_SPLIT] = false;
+  if (width <= 64 && height > 64) splits[BT_VER_SPLIT] = false;
+  if (tree_type == UVG_CHROMA_T && (width * height <= 64 || width <= 8))     splits[BT_VER_SPLIT] = false;
+
+  //if (modeType == MODE_TYPE_INTER && width * height == 32)  splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false;
+
+  // specific checks for TT splits
+  if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size)
+    splits[TT_HOR_SPLIT] = false;
+  if (width > 64 || height > 64)  splits[TT_HOR_SPLIT] = false;
+  if (tree_type == UVG_CHROMA_T && width * height <= 64 * 2)     splits[TT_HOR_SPLIT] = false;
+
+  if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size)
+    splits[TT_VER_SPLIT] = false;
+  if (width > 64 || height > 64)  splits[TT_VER_SPLIT] = false;
+  if (tree_type == UVG_CHROMA_T && (width * height <= 64 * 2 || width <= 16))     splits[TT_VER_SPLIT] = false;
+
+  //if (modeType == MODE_TYPE_INTER && width * height == 64)  splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false;
+  return 0;
+}
+
+
+/*
+ * \brief Count the neighbouring units available along the left or top edge of a block.
+ *
+ * NOTE(review): the result appears to be expressed in TR_MIN_WIDTH-sized
+ * units -- confirm against the callers.
+ *
+ * \param cu_loc  Location and size of the block.
+ * \param lcu     LCU whose CU types are inspected.
+ * \param left    True to inspect the left edge, false to inspect the top edge.
+ * \return 0 when cu_loc->x is 0 (left) or cu_loc->y is 0 (top); otherwise
+ *         the computed count.
+ */
+int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left)
+{
+  if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) {
+    return 0;
+  }
+  // Left edge on the LCU boundary: count from the block's top down to the LCU bottom.
+  if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4;
+  // Top edge on the LCU boundary: half the block width.
+  if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2;
+
+  // Start from the block's own dimension, rounded down to a multiple of 4.
+  int amount = left ? cu_loc->height & ~3 : cu_loc->width & ~3;
+  if(left) {
+    const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y);
+    // NOTE(review): special case for a 64x64 CU found at local (32, 0); the reason is not visible here.
+    if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8;
+    // Extend downwards while the CU in the column to the left has its type set.
+    while (cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) {
+      amount += TR_MIN_WIDTH;
+    }
+    return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH);
+  }
+  // Extend to the right while the CU in the row above has its type set.
+  while (cu_loc->local_x + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) {
+    amount += TR_MIN_WIDTH;
+  }
+  return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH);
+}
--- a/src/cu.h
+++ b/src/cu.h
@ -77,55 +77,6 @@ typedef enum {
  MTS_TR_NUM    = 6,
 } mts_idx;

-extern const uint8_t uvg_part_mode_num_parts[];
-extern const uint8_t uvg_part_mode_offsets[][4][2];
-extern const uint8_t uvg_part_mode_sizes[][4][2];
-
-/**
- * \brief Get the x coordinate of a PU.
- *
- * \param part_mode   partition mode of the containing CU
- * \param cu_width    width of the containing CU
- * \param cu_x        x coordinate of the containing CU
- * \param i           number of the PU
- * \return            location of the left edge of the PU
- */
-#define PU_GET_X(part_mode, cu_width, cu_x, i) \
-  ((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4)
-
-/**
- * \brief Get the y coordinate of a PU.
- *
- * \param part_mode   partition mode of the containing CU
- * \param cu_width    width of the containing CU
- * \param cu_y        y coordinate of the containing CU
- * \param i           number of the PU
- * \return            location of the top edge of the PU
- */
-#define PU_GET_Y(part_mode, cu_width, cu_y, i) \
-  ((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4)
-
-/**
- * \brief Get the width of a PU.
- *
- * \param part_mode   partition mode of the containing CU
- * \param cu_width    width of the containing CU
- * \param i           number of the PU
- * \return            width of the PU
- */
-#define PU_GET_W(part_mode, cu_width, i) \
-  (uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4)
-
-/**
- * \brief Get the height of a PU.
- *
- * \param part_mode   partition mode of the containing CU
- * \param cu_width    width of the containing CU
- * \param i           number of the PU
- * \return            height of the PU
- */
-#define PU_GET_H(part_mode, cu_width, i) \
-  (uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4)

 //////////////////////////////////////////////////////////////////////////
 // TYPES
@ -142,24 +93,53 @@ enum uvg_tree_type {
  UVG_CHROMA_T = 2
 };

+enum split_type { // Split kinds; every value fits in the three bits stored per depth in a split tree
+  NO_SPLIT = 0,     //!< \brief no split
+  QT_SPLIT = 1,     //!< \brief quad-tree split; the only split kind with both upper bits clear
+  BT_HOR_SPLIT = 2, //!< \brief horizontal binary-tree (MTT) split
+  BT_VER_SPLIT = 3, //!< \brief vertical binary-tree (MTT) split
+  TT_HOR_SPLIT = 4, //!< \brief horizontal ternary-tree (MTT) split
+  TT_VER_SPLIT = 5, //!< \brief vertical ternary-tree (MTT) split
+};
+
+typedef struct  { // Split state that is passed by value to the coding tree functions
+  uint32_t split_tree;        //!< \brief packed split_type values, three bits per depth
+  uint8_t current_depth;      //!< \brief presumably the depth reached so far in split_tree -- TODO confirm
+  uint8_t mtt_depth;          //!< \brief presumably the number of MTT splits taken -- TODO confirm
+  uint8_t implicit_mtt_depth; //!< \brief presumably MTT depth caused by implicit splits -- TODO confirm
+  uint8_t part_index;         //!< \brief presumably index of this part within its parent split -- TODO confirm
+} split_tree_t;
+
+
+// Each depth takes three bits "xxy" of the split tree; an x bit is set only by MTT (BT/TT) splits.
+// QT split is allowed only if no MTT split was made: 0x6DB6DB6 selects the x bits of all 9 depths.
+#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0)
+
 /**
 * \brief Struct for CU info
 */
 typedef struct
 {
  uint8_t type        : 3; //!< \brief block type, one of cu_type_t values
-  uint8_t depth       : 3; //!< \brief depth / size of this block
-  uint8_t part_size   : 3; //!< \brief partition mode, one of part_mode_t values
-  uint8_t tr_depth    : 3; //!< \brief transform depth
  uint8_t skipped     : 1; //!< \brief flag to indicate this block is skipped
  uint8_t merged      : 1; //!< \brief flag to indicate this block is merged
  uint8_t merge_idx   : 3; //!< \brief merge index
  uint8_t tr_skip     : 3; //!< \brief transform skip flag
  uint8_t tr_idx      : 3; //!< \brief transform index
-  uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding 
+  uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding 
+
+  uint8_t log2_width : 3;
+  uint8_t log2_height : 3;
+
+  uint8_t log2_chroma_width : 3;
+  uint8_t log2_chroma_height : 3;

  uint16_t cbf;

+  uint8_t root_cbf;
+
+  uint32_t split_tree : 3 * 9;
+
  /**
   * \brief QP used for the CU.
   *
@ -172,12 +152,15 @@ typedef struct
  uint8_t violates_mts_coeff_constraint : 1;
  uint8_t mts_last_scan_pos : 1;

-  uint8_t violates_lfnst_constrained_luma : 1; // Two types, luma and chroma. Luma index is 0.
-  uint8_t violates_lfnst_constrained_chroma : 1; // Two types, luma and chroma. Luma index is 0.
+  uint8_t violates_lfnst_constrained_luma : 1;
+  uint8_t violates_lfnst_constrained_chroma : 1;
  uint8_t lfnst_last_scan_pos : 1;
  uint8_t lfnst_idx : 2;
  uint8_t cr_lfnst_idx : 2;

+  uint8_t luma_deblocking : 2;
+  uint8_t chroma_deblocking : 2;
+
  union {
    struct {
      int8_t mode;
@ -185,6 +168,9 @@ typedef struct
      uint8_t multi_ref_idx;
      int8_t mip_flag;
      int8_t mip_is_transposed;
+      int8_t isp_mode;
+      uint8_t isp_cbfs : 4;
+      uint8_t isp_index : 2;
    } intra;
    struct {
      mv_t    mv[2][2];  // \brief Motion vectors for L0 and L1
@ -200,12 +186,25 @@ typedef struct
 typedef struct {
  int16_t x;
  int16_t y;
+  uint8_t local_x;
+  uint8_t local_y;
  int8_t width;
  int8_t height;
  int8_t chroma_width;
  int8_t chroma_height;
 } cu_loc_t;

+void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height);
+typedef struct encoder_state_t encoder_state_t;
+
+int uvg_get_split_locs(
+  const cu_loc_t* const origin,
+  enum split_type split,
+  cu_loc_t out[4],
+  uint8_t* separate_chroma);
+int uvg_get_possible_splits(const encoder_state_t* const state,
+                            const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]);
+

 #define CU_GET_MV_CAND(cu_info_ptr, reflist) \
  (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)
@ -219,7 +218,7 @@ typedef struct {
    } \
  } while (0)

-#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \
+#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d part_size=%d coded=%d " \
  "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \
  "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \
  "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \
@ -227,7 +226,7 @@ typedef struct {
  "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \
  "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \
  "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \
-  , (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \
+  , (cu).type, (cu).part_size, (cu).coded, \
  (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \
  (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \
  (cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \
@ -246,6 +245,7 @@ typedef struct cu_array_t {
 } cu_array_t;

 cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px);
+void uvg_get_isp_cu_arr_coords(int* x, int* y, int dim);
 const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px);

 cu_array_t * uvg_cu_array_alloc(const int width, const int height);
@ -382,8 +382,9 @@ typedef struct {
  cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1];
 } lcu_t;

-void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type
-                                tree_type);
+void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src);
+
+int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left);

 /**
 * \brief Return pointer to the top right reference CU.
@ -412,9 +413,11 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
 */
 static INLINE void copy_coeffs(const coeff_t *__restrict src,
                               coeff_t *__restrict dest,
-                               size_t width)
+                               size_t width, size_t height, const int lcu_width)
 {
-  memcpy(dest, src, width * width * sizeof(coeff_t));
+  for (int j = 0; j < height; ++j) {
+    memcpy(dest + j * lcu_width, src + j * lcu_width, width * sizeof(coeff_t));
+  }
 }


@ -554,56 +557,52 @@ static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y)
 } while(0)


-#define NUM_CBF_DEPTHS 5
-static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 };
-
 /**
 * Check if CBF in a given level >= depth is true.
 */
-static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane)
+static INLINE int cbf_is_set(uint16_t cbf, color_t plane)
 {
-  return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0;
+  return (cbf & (1 << (plane))) != 0;
 }

 /**
 * Check if CBF in a given level >= depth is true.
 */
-static INLINE int cbf_is_set_any(uint16_t cbf, int depth)
+static INLINE int cbf_is_set_any(uint16_t cbf)
 {
-  return cbf_is_set(cbf, depth, COLOR_Y) ||
-         cbf_is_set(cbf, depth, COLOR_U) ||
-         cbf_is_set(cbf, depth, COLOR_V);
+  return cbf_is_set(cbf, COLOR_Y) ||
+         cbf_is_set(cbf, COLOR_U) ||
+         cbf_is_set(cbf, COLOR_V);
 }

 /**
 * Set CBF in a level to true.
 */
-static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane)
+static INLINE void cbf_set(uint16_t *cbf, color_t plane)
 {
  // Return value of the bit corresponding to the level.
-  *cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane);
+  *cbf |= (1) << (plane);
 }

 /**
 * Set CBF in a level to true if it is set at a lower level in any of
 * the child_cbfs.
 */
-static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane)
+static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], color_t plane)
 {
-  bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) ||
-                       cbf_is_set(child_cbfs[1], depth + 1, plane) ||
-                       cbf_is_set(child_cbfs[2], depth + 1, plane);
+  bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) ||
+                       cbf_is_set(child_cbfs[1], plane) ||
+                       cbf_is_set(child_cbfs[2], plane);
  if (child_cbf_set) {
-    cbf_set(cbf, depth, plane);
+    cbf_set(cbf, plane);
  }
 }

 /**
- * Set CBF in a levels <= depth to false.
 */
-static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane)
+static INLINE void cbf_clear(uint16_t *cbf, color_t plane)
 {
-  *cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane));
+  *cbf &= ~(1 << (plane));
 }

 /**
@ -611,11 +610,11 @@ static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane)
 */
 static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane)
 {
-  cbf_clear(cbf, 0, plane);
-  *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane));
+  cbf_clear(cbf, plane);
+  *cbf |= src & (1 <<  plane);
 }

-#define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth)
-#define SET_SPLITDATA(CU,flag) { (CU)->split=(flag); }
+#define GET_SPLITDATA(CU,curDepth) ((CU)->split_tree >> ((MAX((curDepth), 0) * 3)) & 7)
+#define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE)

 #endif
--- a/src/dep_quant.c
+++ b/src/dep_quant.c
--- a/src/dep_quant.h
+++ b/src/dep_quant.h
@ -0,0 +1,247 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#ifndef DEP_QUANT_H_
+#define DEP_QUANT_H_
+
+#include "cu.h"
+#include "global.h"
+
+#define SM_NUM_CTX_SETS_SIG   3
+#define SM_NUM_CTX_SETS_GTX   2
+#define SM_MAX_NUM_SIG_SBB_CTX 2
+#define SM_MAX_NUM_SIG_CTX    12
+#define SM_MAX_NUM_GTX_CTX    21
+#define SCALE_BITS         15
+#define RICEMAX            32
+
+typedef struct encoder_control_t encoder_control_t;
+
+enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
+
+struct dep_quant_scan_info
+{
+  uint8_t sig_ctx_offset[2];
+  uint8_t gtx_ctx_offset[2];
+  uint16_t cg_pos;
+  uint16_t  pos_y;
+  uint16_t  pos_x;
+  uint8_t next_sbb_right;
+  uint8_t next_sbb_below;
+};
+
+typedef struct
+{
+  int     m_QShift;
+  int64_t m_QAdd;
+  int64_t m_QScale;
+  int64_t m_maxQIdx;
+  int64_t m_thresLast;
+  int64_t m_thresSSbb;
+  // distortion normalization
+  int     m_DistShift;
+  int64_t m_DistAdd;
+  int64_t m_DistStepAdd;
+  int64_t m_DistOrgFact;
+  bool    needs_init;
+} quant_block;
+
+typedef struct
+{
+  int32_t  m_lastBitsX[TR_MAX_WIDTH];
+  int32_t  m_lastBitsY[TR_MAX_WIDTH];
+  uint32_t m_sigSbbFracBits[SM_MAX_NUM_SIG_SBB_CTX][2];
+  uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2];
+  int32_t  m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6];
+  bool     needs_init;
+} rate_estimator_t;
+
+
+typedef struct
+{
+  uint8_t num;
+  uint8_t inPos[5];
+} NbInfoSbb;
+
+typedef struct
+{
+  uint16_t maxDist;
+  uint16_t num;
+  uint16_t outPos[5];
+} NbInfoOut;
+
+typedef struct {
+  int32_t absLevel[4];
+  int64_t deltaDist[4];
+} PQData;
+
+typedef struct {
+  int64_t ALIGNED(32) rdCost[8];
+  int32_t ALIGNED(32) absLevel[8];
+  int32_t ALIGNED(32) prevId[8];
+} Decision;
+
+
+typedef struct {
+  uint8_t* sbbFlags;
+  uint8_t* levels;
+} SbbCtx;
+
+typedef struct {
+  const NbInfoOut* m_nbInfo;
+  uint32_t         m_sbbFlagBits[2][2];
+  SbbCtx           m_allSbbCtx[2];
+  int              m_curr_sbb_ctx_offset;
+  int              m_prev_sbb_ctx_offset;
+  uint8_t          sbb_memory[8 * 1024];
+  uint8_t          level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH];
+  int              num_coeff;
+} common_context;
+
+
+typedef struct {
+  int64_t  m_rdCost;
+  uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id
+  int8_t          m_numSigSbb;
+  int             m_remRegBins;
+  int8_t          m_refSbbCtxId;
+  uint32_t        m_sbbFracBits[2];
+  uint32_t        m_sigFracBits[2];
+  int32_t         m_coeffFracBits[6];
+  int8_t          m_goRicePar;
+  int8_t          m_goRiceZero;
+  int8_t          m_stateId;
+  uint32_t*       m_sigFracBitsArray[12];
+  int32_t*        m_gtxFracBitsArray[21];
+  common_context* m_commonCtx;
+
+  unsigned        effWidth;
+  unsigned        effHeight;
+} depquant_state;
+typedef struct {
+  int64_t  ALIGNED(32) m_rdCost[12];
+  uint8_t  ALIGNED(32) m_absLevels[3][16 * 4]; 
+  uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; 
+  int8_t          ALIGNED(16) m_numSigSbb[12];
+  int             ALIGNED(32) m_remRegBins[12];
+  int8_t          ALIGNED(16) m_refSbbCtxId[12];
+  uint32_t        ALIGNED(32) m_sbbFracBits[12][2];
+  uint32_t        ALIGNED(32) m_sigFracBits[12][2];
+  int32_t         ALIGNED(32) m_coeffFracBits[12][6];
+  int8_t          ALIGNED(16) m_goRicePar[12];
+  int8_t          ALIGNED(16) m_goRiceZero[12];
+  int8_t          ALIGNED(16) m_stateId[12];
+  uint32_t        ALIGNED(32) m_sigFracBitsArray[12][12][2];
+  int32_t         ALIGNED(32) m_gtxFracBitsArray[21][6];
+  common_context* m_commonCtx;
+
+  unsigned        effWidth;
+  unsigned        effHeight;
+
+  bool            all_gte_four;
+  bool            all_lt_four;
+} all_depquant_states;
+
+typedef struct {
+  common_context      m_common_context;
+  all_depquant_states m_allStates;
+  int                 m_curr_state_offset;
+  int                 m_prev_state_offset;
+  int                 m_skip_state_offset;
+  depquant_state      m_startState;
+  quant_block*        m_quant;
+  Decision            m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH];
+} context_store;
+
+
+int uvg_init_nb_info(encoder_control_t* encoder);
+void uvg_dealloc_nb_info(encoder_control_t* encoder);
+
+
+void uvg_dep_quant_dequant(
+  const encoder_state_t* const state,
+  const int block_type,
+  const int width,
+  const int height,
+  const color_t compID,
+  coeff_t* quant_coeff,
+  coeff_t* coeff,
+  bool enableScalingLists);
+
+int uvg_dep_quant(
+  const encoder_state_t* const state,
+  const cu_info_t* const cur_tu,
+  const int width,
+  const int height,
+  const coeff_t* srcCoeff,
+  coeff_t* coeff_out,
+  const color_t compID,
+  enum uvg_tree_type tree_type,
+  int* absSum,
+  const bool enableScalingLists);
+
+
+void uvg_dep_quant_update_state(
+  context_store*  ctxs,
+  int             numIPos,
+  const uint32_t  scan_pos,
+  const Decision* decisions,
+  const uint32_t  sigCtxOffsetNext,
+  const uint32_t  gtxCtxOffsetNext,
+  const NbInfoSbb next_nb_info_ssb,
+  const int       baseLevel,
+  const bool      extRiceFlag,
+  int             decision_id);
+
+
+void uvg_dep_quant_update_state_eos(
+  context_store*  ctxs,
+  const uint32_t  scan_pos,
+  const uint32_t  cg_pos,
+  const uint32_t  sigCtxOffsetNext,
+  const uint32_t  gtxCtxOffsetNext,
+  const uint32_t  width_in_sbb,
+  const uint32_t  height_in_sbb,
+  const uint32_t  next_sbb_right,
+  const uint32_t  next_sbb_below,
+  const Decision* decisions,
+  int             decision_id);
+
+void uvg_dep_quant_check_rd_costs(
+  const all_depquant_states* const state,
+  const enum ScanPosType           spt,
+  const PQData*                    pqDataA,
+  Decision*                        decisions,
+  const int                        decisionA,
+  const int                        decisionB,
+  const int                        state_offset);
+#endif
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -40,30 +40,29 @@
 #include "encoderstate.h"
 #include "global.h"

-bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu);
+bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t*
+                        const cu_loc);
 bool uvg_is_lfnst_allowed(
  const encoder_state_t* const state,
  const cu_info_t* const pred_cu,
-  const int width,
-  const int height,
-  const int x,
-  const int y,
  enum uvg_tree_type tree_type,
  const color_t color,
-  const lcu_t* lcu);
+  const cu_loc_t* const cu_loc, const lcu_t* const lcu);

 void uvg_encode_coding_tree(
  encoder_state_t * const state,
-  uint16_t x_ctb,
-  uint16_t y_ctb,
-  uint8_t depth,
  lcu_coeff_t *coeff,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  const cu_loc_t* const cu_loc,
+  const cu_loc_t* const chroma_loc,
+  split_tree_t split_tree,
+  bool has_chroma);

 void uvg_encode_ts_residual(encoder_state_t* const state,
  cabac_data_t* const cabac,
  const coeff_t* coeff,
  uint32_t width,
+  uint32_t height,
  uint8_t type,
  int8_t scan_mode,
  double* bits);
@ -77,41 +76,47 @@ void uvg_encode_mvd(encoder_state_t * const state,
 double uvg_mock_encode_coding_unit(
  encoder_state_t* const state,
  cabac_data_t* cabac,
-  int x,
-  int y,
-  int depth,
+  const cu_loc_t* const cu_loc,
+  const cu_loc_t* const chroma_loc,
  lcu_t* lcu,
  cu_info_t* cur_cu,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  const split_tree_t split_tree);

-int uvg_encode_inter_prediction_unit(encoder_state_t* const state,
+int uvg_encode_inter_prediction_unit(
+  encoder_state_t* const state,
  cabac_data_t* const cabac,
  const cu_info_t* const cur_cu,
-                                      int x, int y, int width, int height,
-                                      int depth, 
  lcu_t* lcu,
+  double* bits_out,
+  const cu_loc_t* const cu_loc);
+
+void uvg_encode_intra_luma_coding_unit(
+  const encoder_state_t* const state,
+  cabac_data_t* const cabac,
+  const cu_info_t* const cur_cu,
+  const cu_loc_t* const cu_loc,
+  const lcu_t* lcu,
  double* bits_out);

-void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state,
-  cabac_data_t* const cabac,
-  const cu_info_t* const cur_cu,
-  int x, int y, int depth, const lcu_t* lcu, double* bits_out);

-
-bool uvg_write_split_flag(
+uint8_t uvg_write_split_flag(
  const encoder_state_t* const state,
  cabac_data_t* cabac,
  const cu_info_t* left_cu,
  const cu_info_t* above_cu,
-  uint8_t split_flag,
-  int depth,
-  int cu_width,
-  int x,
-  int y,
+  const cu_loc_t* const cu_loc,
+  split_tree_t,
  enum uvg_tree_type tree_type,
+  bool* is_implicit_out,
  double* bits_out);

 void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
  uint8_t lastpos_x, uint8_t lastpos_y,
  uint8_t width, uint8_t height,
  uint8_t type, uint8_t scan, double* bits_out);
+
+void uvg_get_sub_coeff(const coeff_t* dst, const coeff_t* const src, 
+                       const int lcu_x, const int lcu_y, 
+                       const int block_w, const int block_h, 
+                       const int lcu_width);
--- a/src/encoder.c
+++ b/src/encoder.c
@ -320,6 +320,13 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
    encoder->scaling_list.use_default_list = 1;
  }

+  if(cfg->dep_quant) {
+    if(!uvg_init_nb_info(encoder)) {
+      fprintf(stderr, "Could not initialize nb info.\n");
+      goto init_failed;      
+    }
+  }
+
  // ROI / delta QP
  if (cfg->roi.file_path) {
    const char *mode[2] = { "r", "rb" };
@ -379,10 +386,6 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
    goto init_failed;
  }
  
-  // NOTE: When tr_depth_inter is equal to 0, the transform is still split
-  // for SMP and AMP partition units.
-  encoder->tr_depth_inter = 0;
-
  //Tiles
  encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                          encoder->cfg.tiles_height_count > 1;
--- a/src/encoder.h
+++ b/src/encoder.h
@ -38,6 +38,7 @@
 * Initialization of encoder_control_t.
 */

+#include "dep_quant.h"
 #include "global.h" // IWYU pragma: keep
 #include "uvg266.h"
 #include "scalinglist.h"
@ -98,6 +99,10 @@ typedef struct encoder_control_t
  //scaling list
  scaling_list_t scaling_list;

+  NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1];
+  NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1];
+  struct dep_quant_scan_info* scan_info[7 + 1][7 + 1];
+
  //spec: references to variables defined in Rec. ITU-T H.265 (04/2013)
  int8_t tiles_enable; /*!<spec: tiles_enabled */

@ -132,8 +137,6 @@ typedef struct encoder_control_t

  FILE *roi_file;

-  int tr_depth_inter;
-
  //! pic_parameter_set
  struct {
    uint8_t dependent_slice_segments_enabled_flag;
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -528,48 +528,31 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
  WRITE_UE(stream, MIN_SIZE-2, "log2_min_luma_coding_block_size_minus2"); // Min size 2^3 = 8x8
  // if(!no_partition_constraints_override_constraint_flag)
    WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag");
-  WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
-  WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_luma");  
+  WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma");
+  if (encoder->cfg.max_btt_depth[0]) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
+  }
  
  if (encoder->chroma_format != UVG_CSP_400)
  {
    WRITE_U(stream, encoder->cfg.dual_tree, 1, "qtbtt_dual_tree_intra_flag");
  }
  if (encoder->cfg.dual_tree) {
-    WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
-    WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
-    if (0 /*sps_max_mtt_hierarchy_depth_intra_slice_chroma != 0*/) {
-      WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
-      WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
+    WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
+    if (encoder->cfg.max_btt_depth[2]) {
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
    }
  }
-  WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_inter_slice");
-  WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_inter_slice");  
-
-
-#if 0 // mtt depth intra
-  if (max_mtt_depth_intra != 0) {
-    WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_luma");
-    WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_luma");
+  WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice");
+  if (encoder->cfg.max_btt_depth[1] != 0) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
  }
-#endif
-#if 0 // mtt depth inter
-  if (max_mtt_depth_inter != 0) {
-    WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_inter_tile_group");
-    WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_inter_tile_group");
-  }
-#endif
-#if 0 // Dual Tree
-  if (encoder->cfg.dual_i_tree) {
-    WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_tile_group_chroma");
-    WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_tile_group_chroma");
-
-    if (max_mtt_depth_intra != 0) {
-      WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_chroma");
-      WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_chroma");
-    }
-  }
-#endif

  if (LCU_WIDTH > 32)
    WRITE_U(stream, (TR_MAX_LOG2_SIZE - 5) ? 1 : 0, 1, "sps_max_luma_transform_size_64_flag");
@ -665,7 +648,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,

  WRITE_UE(stream, encoder->cfg.log2_parallel_merge_level-2, "log2_parallel_merge_level_minus2");

-  WRITE_U(stream, 0, 1, "sps_isp_enabled_flag");
+  WRITE_U(stream, encoder->cfg.isp, 1, "sps_isp_enabled_flag");
  
  if (state->encoder_control->cfg.mrl) {
    WRITE_U(stream, 1, 1, "sps_mrl_enabled_flag");
@ -706,7 +689,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,

  WRITE_U(stream, 0, 1, "scaling_list_enabled_flag");

-  WRITE_U(stream, 0, 1, "pic_dep_quant_enabled_flag");
+  WRITE_U(stream, encoder->cfg.dep_quant, 1, "pic_dep_quant_enabled_flag");

  WRITE_U(stream, encoder->cfg.signhide_enable, 1, "pic_sign_data_hiding_enabled_flag");

@ -1142,7 +1125,7 @@ static void uvg_encoder_state_write_bitstream_picture_header(
    WRITE_U(stream, 0, 1, "ph_mvd_l1_zero_flag");
  }

-  if (encoder->cfg.jccr) {
+  if (encoder->cfg.jccr && encoder->chroma_format != UVG_CSP_400) {
    WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag");
  }
  // END PICTURE HEADER
@ -1375,11 +1358,14 @@ void uvg_encoder_state_write_bitstream_slice_header(
  }

  // ToDo: depquant
+  if (encoder->cfg.dep_quant) {
+    WRITE_U(stream, 1, 1, "sh_dep_quant_used_flag");
+  }

-  if (state->encoder_control->cfg.signhide_enable) {
+  if (state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) {
    WRITE_U(stream, 1, 1, "sh_sign_data_hiding_used_flag");
  }
-  if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable /* && !cfg.dep_quant*/)
+  if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable  && !encoder->cfg.dep_quant)
  {
    // TODO: find out what this is actually about and parametrize it
    WRITE_U(stream, 0, 1, "sh_ts_residual_coding_disabled_flag"); 
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -627,43 +627,52 @@ static void encode_sao(encoder_state_t * const state,
 * \param prev_qp         -1 if QP delta has not been coded in current QG,
 *                        otherwise the QP of the current QG
 */
-static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp)
+static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const
+                       int depth)
 {

  // Stop recursion if the CU is completely outside the frame.
-  if (x >= state->tile->frame->width || y >= state->tile->frame->height) return;
+  if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return;

-  cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
-  const int cu_width = LCU_WIDTH >> depth;
+  cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y);
+  const int width = 1 << cu->log2_width;

  if (depth <= state->frame->max_qp_delta_depth) {
    *prev_qp = -1;
  }

-  if (cu->depth > depth) {
+  if (cu_loc->width > width) {
    // Recursively process sub-CUs.
-    const int d = cu_width >> 1;
-    set_cu_qps(state, x,     y,     depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x + d, y,     depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x,     y + d, depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp);
+    const int half_width = cu_loc->width >> 1;
+    const int half_height = cu_loc->height >> 1;
+    cu_loc_t split_cu_loc;
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc,     last_qp,     prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc, last_qp,     prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc,     last_qp, prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);

  } else {
    bool cbf_found = *prev_qp >= 0;

-    if (cu->tr_depth > depth) {
+    int y_limit = cu_loc->y + cu_loc->height;
+    int x_limit = cu_loc->x + cu_loc->width;
+    if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
      // The CU is split into smaller transform units. Check whether coded
      // block flag is set for any of the TUs.
-      const int tu_width = LCU_WIDTH >> cu->tr_depth;
-      for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) {
-        for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) {
+      const int tu_width = MIN(TR_MAX_WIDTH, 1 << cu->log2_width);
+      for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) {
+        for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) {
          cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
-          if (cbf_is_set_any(tu->cbf, cu->depth)) {
+          if (cbf_is_set_any(tu->cbf)) {
            cbf_found = true;
          }
        }
      }
-    } else if (cbf_is_set_any(cu->cbf, cu->depth)) {
+    } else if (cbf_is_set_any(cu->cbf)) {
      cbf_found = true;
    }

@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
    if (cbf_found) {
      *prev_qp = qp = cu->qp;
    } else {
-      qp = uvg_get_cu_ref_qp(state, x, y, *last_qp);
+      qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp);
    }

    // Set the correct QP for all state->tile->frame->cu_array elements in
    // the area covered by the CU.
-    for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
-      for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
+    for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) {
+      for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) {
        uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
      }
    }

-    if (is_last_cu_in_qg(state, x, y, depth)) {
+    if (is_last_cu_in_qg(state, cu_loc)) {
      *last_qp = cu->qp;
    }
  }
@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
  if (state->frame->max_qp_delta_depth >= 0) {
    int last_qp = state->last_qp;
    int prev_qp = -1;
-    set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
+    cu_loc_t cu_loc;
+    uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH);
+    set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0);
  }

  if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) {
@ -870,10 +881,16 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)

  enum uvg_tree_type tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T;
  //Encode coding tree
-  uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff, tree_type);
+  cu_loc_t start;
+  uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
+  split_tree_t split_tree = { 0, 0, 0, 0, 0 };
+
+  uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true);

  if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) {
-    uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, 0, lcu->coeff, UVG_CHROMA_T);
+    uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
+    cu_loc_t chroma_tree_loc = start;
+    uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true);
  }

  if (!state->cabac.only_count) {
@ -1152,6 +1169,12 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
          uvg_threadqueue_submit(state->encoder_control->threadqueue, job[0]);

          uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]);
+#ifdef UVG_DEBUG_PRINT_CABAC
+          // Ensures that the CTUs are encoded in raster scan order
+          if(i >= state->tile->frame->width_in_lcu) {
+            uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[(lcu->id / state->tile->frame->width_in_lcu - 1) * state->tile->frame->width_in_lcu]);
+          }
+#endif
        }

        uvg_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
@ -1281,13 +1304,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
            sub_state->tile->frame->width_in_lcu * LCU_WIDTH,
            sub_state->tile->frame->height_in_lcu * LCU_WIDTH
        );
-        if(main_state->encoder_control->cfg.dual_tree){
+        if(main_state->encoder_control->cfg.dual_tree && main_state->frame->is_irap){
          sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray(
              main_state->tile->frame->chroma_cu_array,
-              offset_x / 2,
-              offset_y / 2,
-              sub_state->tile->frame->width_in_lcu * LCU_WIDTH_C,
-              sub_state->tile->frame->height_in_lcu * LCU_WIDTH_C
+              offset_x,
+              offset_y,
+              sub_state->tile->frame->width_in_lcu * LCU_WIDTH,
+              sub_state->tile->frame->height_in_lcu * LCU_WIDTH
          );
        }
      }
@ -1926,10 +1949,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict

  if (cfg->dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 && state->frame->is_irap) {
    assert(state->tile->frame->chroma_cu_array == NULL);
-    state->tile->frame->chroma_cu_array = uvg_cu_array_chroma_alloc(
-      state->tile->frame->width / 2,
-      state->tile->frame->height / 2,
-      state->encoder_control->chroma_format
+    state->tile->frame->chroma_cu_array = uvg_cu_array_alloc(
+      state->tile->frame->width,
+      state->tile->frame->height
    );
  }
  // Set pictype.
@ -2029,9 +2051,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s
 void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame)
 {
 #if UVG_DEBUG_PRINT_CABAC == 1
-  uvg_cabac_bins_count = 0;
+  // uvg_cabac_bins_count = 0;
  if (state->frame->num == 0) uvg_cabac_bins_verbose = true;
-  else uvg_cabac_bins_verbose = false;
+  // else uvg_cabac_bins_verbose = false;
 #endif


@ -2193,11 +2215,12 @@ int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
 {
  const cu_array_t *cua = state->tile->frame->cu_array;
  // Quantization group width
-  const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);
+  const int qg_width = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_width);
+  const int qg_height = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_height);

  // Coordinates of the top-left corner of the quantization group
  const int x_qg = x & ~(qg_width - 1);
-  const int y_qg = y & ~(qg_width - 1);
+  const int y_qg = y & ~(qg_height - 1);
  if(x_qg == 0 && y_qg > 0 && y_qg % LCU_WIDTH == 0) {
    return uvg_cu_array_at_const(cua, x_qg, y_qg - 1)->qp;
  }
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -332,6 +332,7 @@ typedef struct encoder_state_t {
  int8_t qp;

  double c_lambda;
+  double chroma_weights[4];

  /**
   * \brief Whether a QP delta value must be coded for the current LCU.
@ -359,7 +360,15 @@ typedef struct encoder_state_t {
  //Constraint structure  
  void * constraint;

+  // LFNST needs the collocated luma intra mode in dual tree when
+  // the chroma mode is a CCLM mode. Getting all of the information
+  // necessary to derive the collocated luma mode inside the LFNST
+  // functions would be cumbersome, so the current collocated luma
+  // mode is stored in the state instead.
+  int8_t collocated_luma_mode;

+  quant_block quant_blocks[3]; // luma, ISP, chroma
+  rate_estimator_t rate_estimator[4]; // luma, cb, cr, isp
 } encoder_state_t;

 void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame);
@ -401,14 +410,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
 * \param depth   depth in the CU tree
 * \return true, if it's the last CU in its QG, otherwise false
 */
-static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
+static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc)
 {
  if (state->frame->max_qp_delta_depth < 0) return false;
  
-  const int cu_width = LCU_WIDTH >> depth;
  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
-  const int right  = x + cu_width;
-  const int bottom = y + cu_width;
+  const int right  = cu_loc->x + cu_loc->width;
+  const int bottom = cu_loc->y + cu_loc->height;
  return (right % qg_width == 0 || right >= state->tile->frame->width) &&
         (bottom % qg_width == 0 || bottom >= state->tile->frame->height);
 }
--- a/src/filter.c
+++ b/src/filter.c
@ -36,6 +36,7 @@

 #include "cu.h"
 #include "encoder.h"
+#include "intra.h"
 #include "uvg266.h"
 #include "transform.h"
 #include "videoframe.h"
@ -269,19 +270,19 @@ static bool is_tu_boundary(
  int32_t x,
  int32_t y,
  edge_dir dir,
+  color_t color,
  enum uvg_tree_type tree_type)
 {
-  x >>= tree_type == UVG_CHROMA_T;
-  y >>= tree_type == UVG_CHROMA_T;
  // if (x & 3 || y & 3) return false;
  const cu_info_t *const scu =
    uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y);
-  const int tu_width = LCU_WIDTH >> (scu->tr_depth + (tree_type == UVG_CHROMA_T));

  if (dir == EDGE_HOR) {
-    return (y & (tu_width - 1)) == 0;
+    return color == COLOR_Y ? scu->luma_deblocking & EDGE_HOR :
+                              scu->chroma_deblocking & EDGE_HOR;
  } else {
-    return (x & (tu_width - 1)) == 0;
+    return color == COLOR_Y ? scu->luma_deblocking & EDGE_VER :
+                              scu->chroma_deblocking & EDGE_VER;
  }
 }

@ -306,32 +307,6 @@ static bool is_pu_boundary(const encoder_state_t *const state,
   it for now, in case some other tool requires it.
  */
  return false;
-  //const cu_info_t *const scu =
-  //  uvg_cu_array_at_const(state->tile->frame->cu_array, x, y);
-  //// Get the containing CU.
-  //const int32_t cu_width = LCU_WIDTH >> scu->depth;
-  //const int32_t x_cu = x & ~(cu_width - 1);
-  //const int32_t y_cu = y & ~(cu_width - 1);
-  //const cu_info_t *const cu =
-  //  uvg_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu);
-
-  //const int num_pu = uvg_part_mode_num_parts[cu->part_size];
-  //for (int i = 0; i < num_pu; i++) {
-  //  if (dir == EDGE_HOR) {
-  //    int y_pu = PU_GET_Y(cu->part_size, cu_width, y_cu, i);
-  //    if (y_pu == y) {
-  //      return true;
-  //    }
-
-  //  } else {
-  //    int x_pu = PU_GET_X(cu->part_size, cu_width, x_cu, i);
-  //    if (x_pu == x) {
-  //      return true;
-  //    }
-  //  }
-  //}
-
-  //return false;
 }


@ -346,9 +321,9 @@ static bool is_pu_boundary(const encoder_state_t *const state,
 static bool is_on_8x8_grid(int x, int y, edge_dir dir)
 {
  if (dir == EDGE_HOR) {
-    return (y & 7) == 0 && (x & 2) == 0;
+    return (y & 7) == 0;
  } else {
-    return (x & 7) == 0 && (y & 2) == 0;
+    return (x & 7) == 0;
  }
 }

@ -628,10 +603,10 @@ static INLINE void get_max_filter_length(uint8_t *filt_len_P, uint8_t *filt_len_
  bool transform_edge_4x4[2] = { false, false };
  bool transform_edge_8x8[2] = { false, false };
  
-  if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, tree_type);
-  if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, tree_type);
-  if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, tree_type);
-  if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, tree_type);
+  if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, comp, tree_type);
+  if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, comp, tree_type);
+  if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, comp, tree_type);
+  if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, comp, tree_type);

  if (comp == COLOR_Y) {
    if (tu_size_P_side <= 4 || tu_size_Q_side <= 4){
@ -756,8 +731,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
          cu_q = uvg_cu_array_at(frame->cu_array, x_coord, y);
        }

-        bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y)
-          || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y);
+        bool nonzero_coeffs = cbf_is_set(cu_q->cbf, COLOR_Y)
+          || cbf_is_set(cu_p->cbf, COLOR_Y);

        // Filter strength
        strength = 0;
@ -766,7 +741,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
        }
        else if (tu_boundary && nonzero_coeffs) {
          // Non-zero residual/coeffs and transform boundary
-          // Neither CU is intra so tr_depth <= MAX_DEPTH.
          strength = 1;
        }
        else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == UVG_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice?
@ -854,18 +828,50 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
      bool is_side_Q_large = false;
      uint8_t max_filter_length_P = 0;
      uint8_t max_filter_length_Q = 0;
-      const int cu_size = LCU_WIDTH >> cu_q->depth;
-      const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? 
-                               1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0)
-                            + (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0);
-      const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx)
-                                          : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx);
-      const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) 
-                                         : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx);
+
+      const int cu_width = 1 << cu_q->log2_width;
+      const int cu_height = 1 << cu_q->log2_height;
+      const int pu_size = dir == EDGE_HOR ? cu_height : cu_width;
+      const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord;
+      int tu_size_q_side = 0;
+      if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) {
+        if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) {
+          tu_size_q_side = MAX(4, cu_width >> 2);
+        } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) {
+          tu_size_q_side = MAX(4,  cu_height >> 2);
+        } else {
+          tu_size_q_side = dir == EDGE_HOR ?
+                             MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) :
+                             MIN(1 << cu_q->log2_width, TR_MAX_WIDTH);
+        }
+      } else {
+        tu_size_q_side = dir == EDGE_HOR ?
+                           MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) :
+                           MIN(1 << cu_q->log2_width, TR_MAX_WIDTH);
+      }
+
+      int tu_size_p_side = 0;
+      if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) {
+        if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) {
+          tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2);
+        } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) {
+          tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2);
+        } else {
+          tu_size_p_side = dir == EDGE_HOR ?
+                             MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) :
+                             MIN(1 << cu_p->log2_width, TR_MAX_WIDTH);
+        }
+      } else {
+        tu_size_p_side = dir == EDGE_HOR ?
+                           MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) :
+                           MIN(1 << cu_p->log2_width, TR_MAX_WIDTH);
+        
+      }
+
      get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,
                            dir, tu_boundary,
-                            LCU_WIDTH >> cu_p->tr_depth,
-                            LCU_WIDTH >> cu_q->tr_depth,
+                            tu_size_p_side,
+                            tu_size_q_side,
                            pu_pos, pu_size, cu_q->merged, COLOR_Y,
                            UVG_LUMA_T);

@ -1073,41 +1079,44 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
      // CUs on both sides of the edge
      cu_info_t *cu_p;
      cu_info_t *cu_q;
-      int32_t x_coord = x << (tree_type != UVG_CHROMA_T);
-      int32_t y_coord = y << (tree_type != UVG_CHROMA_T);
+      int32_t x_coord = x << 1;
+      int32_t y_coord = y << 1;
      cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array;
      if (dir == EDGE_VER) {
-        y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T);
+        y_coord = (y + min_chroma_length * blk_idx) << (1);
        cu_p = uvg_cu_array_at(cua, x_coord - 1, y_coord);
        cu_q = uvg_cu_array_at(cua, x_coord    , y_coord);

      } else {
-        x_coord = (x + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T);
+        x_coord = (x + min_chroma_length * blk_idx) << (1);
        cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1);
        cu_q = uvg_cu_array_at(cua, x_coord, y_coord    );
      }
      
-      const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T));
-      const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ?
-                               1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0)
-                              + ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0);
-      const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx)
-                                          : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx);
-      const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx)
-                                         : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx);
      uint8_t max_filter_length_P = 0;
      uint8_t max_filter_length_Q = 0;
      
-      const int tu_p_size = LCU_WIDTH >> (cu_p->tr_depth + (chroma_shift));
-      const int tu_q_size = LCU_WIDTH >> (cu_q->tr_depth + (chroma_shift));
+      const int cu_width = 1 << (cu_q->log2_chroma_width );
+      const int cu_height = 1 << (cu_q->log2_chroma_height);
+      const int pu_size = dir == EDGE_HOR ? cu_height : cu_width;
+      const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord;
+
+
+      const int tu_size_p_side = dir == EDGE_HOR ? 
+        MIN(1 << (cu_p->log2_chroma_height), TR_MAX_WIDTH) :
+        MIN(1 << (cu_p->log2_chroma_width), TR_MAX_WIDTH);
+      const int tu_size_q_side = dir == EDGE_HOR ?
+        MIN(1 << (cu_q->log2_chroma_height ), TR_MAX_WIDTH) :
+        MIN(1 << (cu_q->log2_chroma_width ), TR_MAX_WIDTH);
+
      get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,
-                            dir, tu_boundary, tu_p_size, tu_q_size,
+                            dir, tu_boundary, tu_size_p_side, tu_size_q_side,
                            pu_pos, pu_size, cu_q->merged, COLOR_U,
                            tree_type);


      const bool large_boundary = (max_filter_length_P >= 3 && max_filter_length_Q >= 3);
-      const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % (LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) == 0);
+      const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % LCU_WIDTH == 0);
      uint8_t c_strength[2] = { 0, 0 };
      

@ -1116,10 +1125,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
        c_strength[1] = 2;
      }
      else if (tu_boundary){ //TODO: Add ciip/IBC related stuff
-        bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_U)
-                                || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_U);
-        bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_V)
-                                || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_V);
+        bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, COLOR_U)
+                                || cbf_is_set(cu_p->cbf, COLOR_U);
+        bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, COLOR_V)
+                                || cbf_is_set(cu_p->cbf, COLOR_V);
        c_strength[0] = nonzero_coeffs_U ? 1 : 0;
        c_strength[1] = nonzero_coeffs_V ? 1 : 0;
      }
@ -1238,10 +1247,11 @@ static void filter_deblock_unit(
  const int32_t x_c = x >> 1;
  const int32_t y_c = y >> 1;
  if (state->encoder_control->chroma_format != UVG_CSP_400 &&
-    (is_on_8x8_grid(x_c, y_c, dir && (x_c + 4) % 32)
-     || (x == state->tile->frame->width - 8 && dir == 1 && y_c % 8 == 0)) 
+    is_tu_boundary(state, x, y, dir, COLOR_UV, tree_type)
+    && (is_on_8x8_grid(x_c, y_c, dir == EDGE_HOR && (x_c + 4) % 32 ? EDGE_HOR : EDGE_VER)
+     || (x == state->tile->frame->width - 8 && dir == EDGE_HOR && y_c % 8 == 0)) 
    && tree_type != UVG_LUMA_T) {
-    filter_deblock_edge_chroma(state, x_c, y_c, length, dir, tu_boundary, tree_type);
+    filter_deblock_edge_chroma(state, x_c, y_c, 2, dir, tu_boundary, tree_type);
  }
 }

@ -1271,11 +1281,11 @@ static void filter_deblock_lcu_inside(encoder_state_t * const state,

  for (int edge_y = y; edge_y < end_y; edge_y += 4) {
    for (int edge_x = x; edge_x < end_x; edge_x += 4) {
-      bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, luma_tree);
+      bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, COLOR_Y, luma_tree);
      if (tu_boundary || is_pu_boundary(state, edge_x, edge_y, dir)) {
        filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, luma_tree);
      }
-      if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, chroma_tree)) {
+      if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, COLOR_UV, chroma_tree)) {
        filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, chroma_tree);        
      }
    }
@ -1302,7 +1312,7 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state,
  for (int x = x_px - 8; x < x_px; x += 4) {
    for (int y = y_px; y < end; y += 4) {
      // The top edge of the whole frame is not filtered.
-      bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, luma_tree);
+      bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, COLOR_Y, luma_tree);
      if (y > 0 && (tu_boundary || is_pu_boundary(state, x, y, EDGE_HOR))) {
        filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR, tu_boundary);
      }
@ -1313,13 +1323,15 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state,
  if (state->encoder_control->chroma_format != UVG_CSP_400) {
    const int x_px_c = x_px >> 1;
    const int y_px_c = y_px >> 1;
-    const int x_c = x_px_c - 4;
-    const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
-    for (int y_c = y_px_c; y_c < end_c; y_c += 8) {
+    int x_c = x_px_c - 4;
+    const int end_c_y = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
+    for(; x_c < x_px_c; x_c += 2) {
+      for (int y_c = y_px_c; y_c < end_c_y; y_c += 8) {
        // The top edge of the whole frame is not filtered.
-      bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, chroma_tree);
+        bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, COLOR_UV, chroma_tree);
        if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) {
-        filter_deblock_edge_chroma(state, x_c , y_c, 4, EDGE_HOR, tu_boundary, chroma_tree);
+          filter_deblock_edge_chroma(state, x_c , y_c, 2, EDGE_HOR, tu_boundary, chroma_tree);
+        }
      }
    }
  }
--- a/src/filter.h
+++ b/src/filter.h
@ -46,8 +46,8 @@
 * \brief Edge direction.
 */
 typedef enum edge_dir {
-  EDGE_VER = 0, // vertical
-  EDGE_HOR = 1, // horizontal
+  EDGE_VER = 1, // vertical
+  EDGE_HOR = 2, // horizontal
 } edge_dir;


--- a/src/global.h
+++ b/src/global.h
@ -145,11 +145,11 @@ typedef int32_t mv_t;

 #define INTERNAL_MV_PREC 4 // Internal motion vector precision, 4 = 1/16 pel

-//! Limits for prediction block sizes. 0 = 64x64, 4 = 4x4.
+//! Limits for prediction block sizes. 
 #define PU_DEPTH_INTER_MIN 0
-#define PU_DEPTH_INTER_MAX 3
+#define PU_DEPTH_INTER_MAX 8
 #define PU_DEPTH_INTRA_MIN 0
-#define PU_DEPTH_INTRA_MAX 4
+#define PU_DEPTH_INTRA_MAX 8

 //! Maximum number of layers in GOP structure (for allocating structures)
 #define MAX_GOP_LAYERS 6
@ -273,7 +273,6 @@ typedef int32_t mv_t;
 #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
 #define CLIP_TO_QP(value) CLIP(0, 51, (value))
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
-#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
 #define CEILDIV(x,y) (((x) + (y) - 1) / (y))

--- a/src/inter.c
+++ b/src/inter.c
@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride,
 * \param predict_luma   Enable or disable luma prediction for this call.
 * \param predict_chroma Enable or disable chroma prediction for this call.
 */
-static unsigned inter_recon_unipred(const encoder_state_t * const state,
+static unsigned inter_recon_unipred(
+  const encoder_state_t * const state,
  const uvg_picture * const ref,
-                                    int32_t pu_x,
-                                    int32_t pu_y,
-                                    int32_t pu_w,
-                                    int32_t pu_h,
  int32_t out_stride_luma,
  const mv_t mv_param[2],
  yuv_t *yuv_px,
  yuv_im_t *yuv_im,
  bool predict_luma,
-                                    bool predict_chroma)
+  bool predict_chroma,
+  const cu_loc_t* const cu_loc)
 {
  vector2d_t int_mv = { mv_param[0], mv_param[1] };

  uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv);

+  const int pu_x = cu_loc->x;
+  const int pu_y = cu_loc->y;
+  const int pu_w = cu_loc->width;
+  const int pu_h = cu_loc->height;
+
  const vector2d_t int_mv_in_frame = {
    int_mv.x + pu_x + state->tile->offset_x,
    int_mv.y + pu_y + state->tile->offset_y
@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state,
 * \param predict_luma   Enable or disable luma prediction for this call.
 * \param predict_chroma Enable or disable chroma prediction for this call.
 */
-void uvg_inter_recon_bipred(const encoder_state_t *const state,
+void uvg_inter_recon_bipred(
+  const encoder_state_t *const state,
  const uvg_picture *ref1,
  const uvg_picture *ref2,
-  int32_t pu_x,
-  int32_t pu_y,
-  int32_t pu_w,
-  int32_t pu_h,
  mv_t mv_param[2][2],
  lcu_t *lcu,
  bool predict_luma,
-  bool predict_chroma)
+  bool predict_chroma,
+  const cu_loc_t* const cu_loc)
 {
  // Allocate maximum size arrays for interpolated and copied samples
  ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,
  ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
  ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];

+  const int pu_x = cu_loc->x;
+  const int pu_y = cu_loc->y;
+  const int pu_w = cu_loc->width;
+  const int pu_h = cu_loc->height;
+
  yuv_t px_L0;
  px_L0.size = pu_w * pu_h;
  px_L0.y = &px_buf_L0[0];
@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,

  // Sample blocks from both reference picture lists.
  // Flags state if the outputs were written to high-precision / interpolated sample buffers.
-  unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0],
-                                             &px_L0, &im_L0, predict_luma, predict_chroma);
-  unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1],
-                                             &px_L1, &im_L1, predict_luma, predict_chroma);
+  unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma,
+                                             cu_loc);
+  unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma,
+                                             cu_loc);

  // After reconstruction, merge the predictors by taking an average of each pixel
  uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1,
@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,
 * \param predict_luma   Enable or disable luma prediction for this call.
 * \param predict_chroma Enable or disable chroma prediction for this call.
 */
-void uvg_inter_recon_cu(const encoder_state_t * const state,
+void uvg_inter_recon_cu(
+  const encoder_state_t * const state,
  lcu_t *lcu,
-                        int32_t x,
-                        int32_t y,
-                        int32_t width,
  bool predict_luma,
-                        bool predict_chroma)
+  bool predict_chroma,
+  const cu_loc_t* const cu_loc)
 {
-  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
-  const int num_pu = uvg_part_mode_num_parts[cu->part_size];
-  for (int i = 0; i < num_pu; ++i) {
-    uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i);
-  }
+  uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc);  
 }

 static void ibc_recon_cu(const encoder_state_t * const state,
@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state,
                         int32_t y,
                         int32_t width,
                         bool predict_luma,
-                         bool predict_chroma,
-                         int i_pu)
+                         bool predict_chroma)
 {
  const int x_scu    = SUB_SCU(x);
  const int y_scu    = SUB_SCU(y);
@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state,
 * \param predict_chroma Enable or disable chroma prediction for this call.
 * \param i_pu           Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP.
 */
-void uvg_inter_pred_pu(const encoder_state_t * const state,
+void uvg_inter_pred_pu(
+  const encoder_state_t * const state,
  lcu_t *lcu,
-                       int32_t x,
-                       int32_t y,
-                       int32_t width,
  bool predict_luma,
  bool predict_chroma,
-                       int i_pu)
+  const cu_loc_t* const cu_loc)

 {
-  const int x_scu = SUB_SCU(x);
-  const int y_scu = SUB_SCU(y);
-  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);
-  const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu);
-  const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu);
-  const int pu_w = PU_GET_W(cu->part_size, width, i_pu);
-  const int pu_h = PU_GET_H(cu->part_size, width, i_pu);
-  cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
-
-  if (cu->type == CU_IBC) {
-    ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu);
-  } else {
+  const int x_scu = SUB_SCU(cu_loc->x);
+  const int y_scu = SUB_SCU(cu_loc->y);
+  cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);

  if (pu->inter.mv_dir == 3) {
    const uvg_picture *const refs[2] = {
-        state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]],
-        state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]],
+      state->frame->ref->images[
+        state->frame->ref_LX[0][
+          pu->inter.mv_ref[0]]],
+      state->frame->ref->images[
+        state->frame->ref_LX[1][
+          pu->inter.mv_ref[1]]],
    };
-      uvg_inter_recon_bipred(
-        state,
-        refs[0],
-        refs[1],
-        pu_x,
-        pu_y,
-        pu_w,
-        pu_h,
-        pu->inter.mv,
-        lcu,
-        predict_luma,
-        predict_chroma);
+    uvg_inter_recon_bipred(state,
+                           refs[0], refs[1],
+                           pu->inter.mv, lcu,
+                           predict_luma, predict_chroma,
+                           cu_loc);
+  }
+  else if (pu->type == CU_IBC) {
+    ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma);
  } else{
    const int mv_idx = pu->inter.mv_dir - 1;
    const uvg_picture *const ref =
-        state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]];
+      state->frame->ref->images[
+        state->frame->ref_LX[mv_idx][
+          pu->inter.mv_ref[mv_idx]]];

-      const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x);
-      const unsigned offset_chroma =
-        SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2;
+    const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x);
+    const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2;
    yuv_t lcu_adapter;
-      lcu_adapter.size = pu_w * pu_h;
-      lcu_adapter.y    = lcu->rec.y + offset_luma,
-      lcu_adapter.u    = lcu->rec.u + offset_chroma,
-      lcu_adapter.v    = lcu->rec.v + offset_chroma,
+    lcu_adapter.size = cu_loc->width * cu_loc->height;
+    lcu_adapter.y = lcu->rec.y + offset_luma;
+    lcu_adapter.u = lcu->rec.u + offset_chroma;
+    lcu_adapter.v = lcu->rec.v + offset_chroma;

-      inter_recon_unipred(
-        state,
+    inter_recon_unipred(state,
                        ref,
-        pu_x,
-        pu_y,
-        pu_w,
-        pu_h,
-        LCU_WIDTH,
-        pu->inter.mv[mv_idx],
+                        LCU_WIDTH, pu->inter.mv[mv_idx],
                        &lcu_adapter,
                        NULL,
                        predict_luma,
-        predict_chroma);
-    }
+                        predict_chroma,
+                        cu_loc);
  }
  if (predict_chroma && state->encoder_control->cfg.jccr) {
    const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
-    uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
-    uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+    uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
+    uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
  }
 }

@ -915,11 +899,9 @@ static bool is_b0_cand_coded(int x, int y, int width, int height)
 * \param ref_idx   index in the reference list
 * \param cand_out  will be filled with C0 and C1 candidates
 */
-static void get_temporal_merge_candidates(const encoder_state_t * const state,
-                                          int32_t x,
-                                          int32_t y,
-                                          int32_t width,
-                                          int32_t height,
+static void get_temporal_merge_candidates(
+  const encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  uint8_t ref_list,
  uint8_t ref_idx,
  merge_candidates_t *cand_out)
@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state,
    cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref];
    int cu_per_width = ref_cu_array->width / SCU_WIDTH;

-    int32_t xColBr = x + width;
-    int32_t yColBr = y + height;
+    int32_t xColBr = cu_loc->x + cu_loc->width;
+    int32_t yColBr = cu_loc->y + cu_loc->height;

    // C0 must be available
    if (xColBr < state->encoder_control->in.width &&
@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state,
        }
      }
    }
-    int32_t xColCtr = x + (width / 2);
-    int32_t yColCtr = y + (height / 2);
+    int32_t xColCtr = cu_loc->x + (cu_loc->width / 2);
+    int32_t yColCtr = cu_loc->y + (cu_loc->height / 2);

    // C1 must be inside the LCU, in the center position of current CU
    if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) {
@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state,
 * \param lcu             current LCU
 * \param cand_out        will be filled with A and B candidates
 */
-static void get_spatial_merge_candidates(int32_t x,
-                                         int32_t y,
-                                         int32_t width,
-                                         int32_t height,
+static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc,
                                         int32_t picture_width,
                                         int32_t picture_height,
                                         lcu_t *lcu,
@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x,
  |A1|_________|
  |A0|
  */
-  int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
-  int32_t y_local = SUB_SCU(y);
+  const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU
+  const int32_t y_local = SUB_SCU(cu_loc->y);
+
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  const int width = cu_loc->width;
+  const int height = cu_loc->height;
  // A0 and A1 availability testing
  if (x != 0) {
    cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1);
@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x,
 * \param picture_height  tile height in pixels
 * \param cand_out        will be filled with A and B candidates
 */
-static void get_spatial_merge_candidates_cua(const cu_array_t *cua,
-                                             int32_t x,
-                                             int32_t y,
-                                             int32_t width,
-                                             int32_t height,
+static void get_spatial_merge_candidates_cua(
+  const cu_array_t *cua,
  int32_t picture_width,
  int32_t picture_height,
  merge_candidates_t *cand_out,
-                                             bool wpp)
+  bool wpp,
+  const cu_loc_t* const cu_loc)
 {
  /*
  Predictor block locations
@ -1370,8 +1352,12 @@ static void get_spatial_merge_candidates_cua(const cu_array_t *cua,
  |A1|_________|
  |A0|
  */
-  int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
-  int32_t y_local = SUB_SCU(y);
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  const int width = cu_loc->width;
+  const int height = cu_loc->height;
+  const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
+  const int32_t y_local = SUB_SCU(y);
  // A0 and A1 availability testing
  if (x != 0) {
    const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1);
@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state,
 /**
 * \brief Pick two mv candidates from the spatial and temporal candidates.
 */
-static void get_mv_cand_from_candidates(const encoder_state_t * const state,
-                                        int32_t x,
-                                        int32_t y,
-                                        int32_t width,
-                                        int32_t height,
+static void get_mv_cand_from_candidates(
+  const encoder_state_t * const state,
  const merge_candidates_t *merge_cand,
  const cu_info_t * const cur_cu,
  int8_t reflist,
-                                        mv_t mv_cand[2][2])
+  mv_t mv_cand[2][2],
+  int ctu_row)
 {
  const cu_info_t *const *a = merge_cand->a;
  const cu_info_t *const *b = merge_cand->b;
@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,

  if (candidates < AMVP_MAX_NUM_CANDS)
  {
-    const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
    const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
    int32_t num_cand = state->tile->frame->hmvp_size[ctu_row];
    for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) {
@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
 * \param lcu       current LCU
 * \param reflist   reflist index (either 0 or 1)
 */
-void uvg_inter_get_mv_cand(const encoder_state_t * const state,
-                           int32_t x,
-                           int32_t y,
-                           int32_t width,
-                           int32_t height,
+void uvg_inter_get_mv_cand( // Fills mv_cand with two MV predictor candidates for cur_cu; spatial neighbours are looked up through lcu
+  const encoder_state_t * const state,
  mv_t mv_cand[2][2],
  const cu_info_t  * const cur_cu,
  lcu_t *lcu,
-                           int8_t reflist)
+  int8_t reflist,
+  const cu_loc_t* const cu_loc) // location of the current CU; x, y, width and height are read from it
 {
  merge_candidates_t merge_cand = { 0 };
  const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level;
  if (cur_cu->type == CU_IBC) {
    mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
-    get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand);
+    get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); // IBC CUs: the first two IBC merge candidates are copied out as the predictors below
    memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2);
    memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2);
  } else { 
-    get_spatial_merge_candidates(x, y, width, height,
-                                 state->tile->frame->width,
-                                 state->tile->frame->height,
-                                 lcu,
-                                 &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp);
-    get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
-    get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
+    get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu,
+                                 &merge_cand,
+                                 parallel_merge_level,
+                                 state->encoder_control->cfg.wpp);
+    get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // the literals 1 and 0 are the ref_list and ref_idx arguments
+    get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); // last argument is the CTU row of this CU (indexes the per-row HMVP data)
  }
+    
  uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]);
  uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]);
 }
@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
 * \param cur_cu    current CU
 * \param reflist   reflist index (either 0 or 1)
 */
-void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state,
-                               int32_t x,
-                               int32_t y,
-                               int32_t width,
-                               int32_t height,
+void uvg_inter_get_mv_cand_cua( // Fills mv_cand with two MV predictor candidates for cur_cu; spatial neighbours are looked up in the frame's cu_array
+  const encoder_state_t * const state,
  mv_t mv_cand[2][2],
  const cu_info_t* cur_cu,
-                               int8_t reflist)
+  int8_t reflist,
+  const cu_loc_t* const cu_loc) // location of the current CU; x, y, width and height are read from it
 {
  merge_candidates_t merge_cand = { 0 };

  const cu_array_t *cua = state->tile->frame->cu_array;
  if (cur_cu->type == CU_IBC) {
    mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
-    get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand);
+    get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); // IBC CUs: the first two IBC merge candidates are copied out as the predictors below
    memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2);
    memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2);    
  } else {
    get_spatial_merge_candidates_cua(cua,
-                                     x, y, width, height,
-                                     state->tile->frame->width, state->tile->frame->height,
-                                     &merge_cand, state->encoder_control->cfg.wpp);
-    get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
-    get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
+                                     state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp,
+                                     cu_loc);
+    get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // the literals 1 and 0 are the ref_list and ref_idx arguments
+    get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); // last argument is the CTU row of this CU (indexes the per-row HMVP data)
  }
+
  uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]);
  uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]);
 }
@ -1885,10 +1864,9 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) {
 * \param lcu       lcu containing the block
 * \return          number of merge candidates
 */
-uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
-                                 int32_t x, int32_t y,
-                                 int32_t width, int32_t height,
-                                 bool use_a1, bool use_b1,
+uint8_t uvg_inter_get_merge_cand(
+  const encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS],
  lcu_t *lcu)
 {
@ -1897,11 +1875,12 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
  const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level;
  merge_candidates_t merge_cand = { 0 };
  const uint8_t max_num_cands = state->encoder_control->cfg.max_merge;
+  // Current CU
+  cu_info_t         *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y));

-  cu_info_t         *cur_cu        = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
  if(cur_cu->type == CU_IBC) {
    mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
-    get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand);
+    get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand);
    for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) {
      mv_cand[i].dir = 1;
      mv_cand[i].mv[0][0] = ibc_mv_cand[i][0];
@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
    }
    return IBC_MRG_MAX_NUM_CANDS;
  }
-
-  get_spatial_merge_candidates(x, y, width, height,
-                               state->tile->frame->width,
-                               state->tile->frame->height,
-                               lcu,
-                               &merge_cand, parallel_merge_level, state->encoder_control->cfg.wpp);
+  get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu,
+                               &merge_cand,
+                               parallel_merge_level,
+                               state->encoder_control->cfg.wpp);

  const cu_info_t **a = merge_cand.a;
  const cu_info_t **b = merge_cand.b;

-  if (!use_a1) a[1] = NULL;
-  if (!use_b1) b[1] = NULL;
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;

  if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++;
  if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++;
@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
    for (int reflist = 0; reflist <= max_reflist; reflist++) {
      // Fetch temporal candidates for the current CU
      // ToDo: change collocated_from_l0_flag to allow L1 ref
-      get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
+      get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand);
      // TODO: enable L1 TMVP candidate
      // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand);

@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
  if (candidates == max_num_cands) return candidates;

  if (candidates != max_num_cands - 1) {
-    const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
+    const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH);
    const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
    int32_t num_cand = state->tile->frame->hmvp_size[ctu_row];

--- a/src/inter.h
+++ b/src/inter.h
@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv);
 void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver);
 void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv);

-void uvg_inter_recon_cu(const encoder_state_t * const state,
+void uvg_inter_recon_cu(
+  const encoder_state_t * const state,
  lcu_t *lcu,
-                        int32_t x,
-                        int32_t y,
-                        int32_t width,
-                        bool predict_luma,
-                        bool predict_chroma);
-
-void uvg_inter_pred_pu(const encoder_state_t * const state,
-  lcu_t *lcu,
-  int32_t x,
-  int32_t y,
-  int32_t width,
  bool predict_luma,
  bool predict_chroma,
-  int i_pu);
+  const cu_loc_t* const cu_loc);
+
+void uvg_inter_pred_pu(
+  const encoder_state_t * const state,
+  lcu_t *lcu,
+  bool predict_luma,
+  bool predict_chroma,
+  const cu_loc_t* const cu_loc);

 void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu);

-void uvg_inter_recon_bipred(const encoder_state_t * const state,
+void uvg_inter_recon_bipred(
+  const encoder_state_t * const state,
  const uvg_picture * ref1,
  const uvg_picture * ref2,
-                            int32_t xpos,
-                            int32_t ypos,
-                            int32_t width,
-                            int32_t height,
  mv_t mv_param[2][2],
  lcu_t* lcu,
  bool predict_luma,
-                            bool predict_chroma);
+  bool predict_chroma,
+  const cu_loc_t* const cu_loc);


-void uvg_inter_get_mv_cand(const encoder_state_t * const state,
-                           int32_t x,
-                           int32_t y,
-                           int32_t width,
-                           int32_t height,
+void uvg_inter_get_mv_cand(
+  const encoder_state_t * const state,
  mv_t mv_cand[2][2],
  const cu_info_t* cur_cu,
  lcu_t *lcu,
-                           int8_t reflist);
+  int8_t reflist,
+  const cu_loc_t* const cu_loc);

-void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state,
-                               int32_t x,
-                               int32_t y,
-                               int32_t width,
-                               int32_t height,
+void uvg_inter_get_mv_cand_cua(
+  const encoder_state_t * const state,
  mv_t mv_cand[2][2],
  const cu_info_t* cur_cu,
-                               int8_t reflist);
+  int8_t reflist,
+  const cu_loc_t* const cu_loc);

-uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
-                                 int32_t x, int32_t y,
-                                 int32_t width, int32_t height,
-                                 bool use_a1, bool use_b1,
+uint8_t uvg_inter_get_merge_cand(
+  const encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS],
  lcu_t *lcu);
 #endif
--- a/src/intra.c
+++ b/src/intra.c
--- a/src/intra.h
+++ b/src/intra.h
@ -71,6 +71,7 @@ typedef struct {
  double coeff_bits;
  double distortion;
  double lfnst_costs[3];
+  uint8_t best_isp_cbfs;
 } intra_search_data_t ;


@ -107,7 +108,9 @@ int8_t uvg_intra_get_dir_luma_predictor(
 * \param multi_ref_idx Multi reference line index for the prediction block.
 */
 void uvg_intra_build_reference(
-  const int_fast8_t log2_width,
+  const encoder_state_t* const state,
+  const cu_loc_t* const pu_loc,
+  const cu_loc_t* const cu_loc,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
@ -115,7 +118,8 @@ void uvg_intra_build_reference(
  uvg_intra_references *const refs,
  bool entropy_sync,
  uvg_pixel *extra_refs,
-  uint8_t multi_ref_idx);
+  uint8_t multi_ref_idx,
+  const uint8_t isp_mode);

 /**
 * \brief Generate intra predictions.
@ -130,32 +134,60 @@ void uvg_intra_predict(
  const encoder_state_t* const state,
  uvg_intra_references* const refs,
  const cu_loc_t* const cu_loc,
+  const cu_loc_t* const pu_loc,
  const color_t color,
  uvg_pixel* dst,
  const intra_search_data_t* data,
-  const lcu_t* lcu,
-  enum uvg_tree_type tree_type
+  const lcu_t* lcu
 );

 void uvg_intra_recon_cu(
  encoder_state_t* const state,
-  int x,
-  int y,
-  int depth,
  intra_search_data_t* search_data,
+  const cu_loc_t* cu_loc,
  cu_info_t *cur_cu,
  lcu_t *lcu,
  enum uvg_tree_type tree_type,
  bool recon_luma,
  bool recon_chroma);

-const cu_info_t* uvg_get_co_located_luma_cu(
-  int x,
-  int y,
-  int width,
-  int height,
+double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state,
+                                       const cu_loc_t* const cu_loc,
+                                       double cost_treshold,
+                                       intra_search_data_t* const search_data,
+                                       lcu_t* const lcu, bool* violates_lfnst);
+
+int8_t uvg_get_co_located_luma_mode(
+  const cu_loc_t* const chroma_loc,
+  const cu_loc_t* const cu_loc,
+  const cu_info_t* luma_cu,
  const lcu_t* const lcu,
  const cu_array_t* const cu_array,
  enum uvg_tree_type tree_type);
+bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t* const luma_loc, cu_info_t const* const cur_cu, enum
+                         uvg_tree_type tree_type);

-int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a);
+uint8_t uvg_get_mip_flag_context(
+  const cu_loc_t* const cu_loc,
+  const lcu_t* lcu,
+  cu_array_t* const cu_a);
+
+int8_t uvg_wide_angle_correction(
+  int_fast8_t mode,
+  const int log2_width,
+  const int log2_height,
+  const bool account_for_dc_planar);
+
+// ISP related defines
+#define NUM_ISP_MODES 3
+#define ISP_MODE_NO_ISP 0
+#define ISP_MODE_HOR 1
+#define ISP_MODE_VER 2
+#define SPLIT_TYPE_HOR 1
+#define SPLIT_TYPE_VER 2
+
+int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block);
+int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block);
+void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block);
+bool uvg_can_use_isp(const int width, const int height);
+bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type);
--- a/src/rate_control.c
+++ b/src/rate_control.c
@ -795,12 +795,20 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
      state->frame->QP + 2 + frame_allocation,
      est_qp);
  }
+  if(state->encoder_control->cfg.dep_quant) {
+    est_lambda *= pow(2, 0.25 / 3.0);
+  }

  state->lambda = est_lambda;
  state->lambda_sqrt = sqrt(est_lambda);
  state->qp = est_qp;
  int8_t chroma_qp = encoder->qp_map[0][est_qp];
  double tmpWeight = pow(2.0, (est_qp - chroma_qp) / 3.0);
+  if (state->encoder_control->cfg.dep_quant)
+  {
+    tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0));  // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
+  }
+  state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight;
  state->c_lambda = est_lambda / tmpWeight;
  ctu->qp = est_qp;
  ctu->lambda = est_lambda;
@ -820,7 +828,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
    state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp);
    state->qp = CLIP_TO_QP(state->qp);
-    state->lambda = qp_to_lambda(state, state->qp);
+    double to_lambda = qp_to_lambda(state, state->qp);
+    if (state->encoder_control->cfg.dep_quant) {
+      to_lambda *= pow(2, 0.25 / 3.0);
+    }
+    state->lambda = to_lambda;
    state->lambda_sqrt = sqrt(state->lambda);
    
    ctu->adjust_lambda = state->lambda;
@ -1103,7 +1115,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
      pos.x = 0;
    }
    state->qp = CLIP_TO_QP(state->frame->QP + dqp);
-    state->lambda = qp_to_lambda(state, state->qp);
+    double to_lambda = qp_to_lambda(state, state->qp);
+
+    if (state->encoder_control->cfg.dep_quant) {
+      to_lambda *= pow(2, 0.25 / 3.0);
+    }
+    state->lambda = to_lambda;
    state->lambda_sqrt = sqrt(state->lambda);
  }
  else if (ctrl->cfg.target_bitrate > 0) {
@ -1138,6 +1155,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
                  state->frame->lambda * 1.5874010519681994,
                  lambda);
    lambda = clip_lambda(lambda);
+    if (state->encoder_control->cfg.dep_quant) {
+      lambda *= pow(2, 0.25 / 3.0);
+    }

    state->lambda      = lambda;
    state->lambda_sqrt = sqrt(lambda);
@ -1145,8 +1165,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,

  } else {
    state->qp          = state->frame->QP;
-    state->lambda      = state->frame->lambda;
-    state->lambda_sqrt = sqrt(state->frame->lambda);
+    double lambda = state->frame->lambda;
+
+    if (state->encoder_control->cfg.dep_quant) {
+      lambda *= pow(2, 0.25 / 3.0);
+    }
+    state->lambda      = lambda;
+    state->lambda_sqrt = sqrt(lambda);
  }

  lcu->lambda = state->lambda;
@ -1154,6 +1179,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,

  int8_t chroma_qp = ctrl->qp_map[0][state->qp];
  double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0);
+  if (state->encoder_control->cfg.dep_quant)
+  {
+    tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0));  // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
+  }
+  state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight;
  state->c_lambda = state->lambda / tmpWeight;

  // Apply variance adaptive quantization
@ -1170,10 +1200,34 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
    state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp);
    state->qp = CLIP_TO_QP(state->qp);
-    state->lambda = qp_to_lambda(state, state->qp);
+    double to_lambda = qp_to_lambda(state, state->qp);
+    if (state->encoder_control->cfg.dep_quant) {
+      to_lambda *= pow(2, 0.25 / 3.0);
+    }
+    state->lambda = to_lambda;
    state->lambda_sqrt = sqrt(state->lambda);

    lcu->adjust_lambda = state->lambda;
    lcu->adjust_qp = state->qp;
  }
 }
+
+
+/** \brief Chroma lambda: state->lambda divided by the chroma weight for state->qp, then scaled by the JCCR factors. */
+double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode)
+{
+  const encoder_control_t * const ctrl = state->encoder_control;
+  const int8_t chroma_qp = ctrl->qp_map[0][state->qp];
+  double chroma_weight = pow(2.0, (state->qp - chroma_qp) / 3.0);
+  if (ctrl->cfg.dep_quant) {
+    // Increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
+    chroma_weight *= pow(2.0, (ctrl->cfg.gop_len >= 8 ? 0.1 : 0.2) / 3.0);
+  }
+  double chroma_lambda = state->lambda / chroma_weight;
+  chroma_lambda *= (use_jccr && state->qp > 18) ? 1.3 : 1.0;
+  switch (jccr_mode) {
+    case 1: case 2: chroma_lambda *= 0.8; break;
+    case 3:         chroma_lambda *= 0.5; break;
+  }
+  return chroma_lambda;
+}
--- a/src/rate_control.h
+++ b/src/rate_control.h
@ -76,4 +76,6 @@ void uvg_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos);
 void uvg_update_after_picture(encoder_state_t * const state);
 void uvg_estimate_pic_lambda(encoder_state_t * const state);

+double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode);
+
 #endif // RATE_CONTROL_H_
--- a/src/rdo.c
+++ b/src/rdo.c
@ -33,6 +33,7 @@
 #include "rdo.h"

 #include <errno.h>
+#include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
@ -52,7 +53,6 @@
 #include "strategies/strategies-quant.h"


-#define QUANT_SHIFT          14
 #define SCAN_SET_SIZE        16
 #define LOG2_SCAN_SET_SIZE    4
 #define SBH_THRESHOLD         4
@ -297,15 +297,20 @@ out:
 static INLINE double get_coeff_cabac_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
-  int32_t width,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip,
  cu_info_t* cur_tu)
 {
+  const int width  = cu_loc->width;
+  const int height = cu_loc->height;
+  const int sub_coeff_w = color == COLOR_Y ? cu_loc->width  : cu_loc->chroma_width;
+  const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+
  // Make sure there are coeffs present
  bool found = false;
-  for (int i = 0; i < width*width; i++) {
+  for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) {
    if (coeff[i] != 0) {
      found = 1;
      break;
@ -330,7 +335,7 @@ static INLINE double get_coeff_cabac_cost(
    uvg_encode_coeff_nxn((encoder_state_t*) state,
                         &cabac_copy,
                         coeff,
-                         width,
+                         cu_loc,
                         color,
                         scan_mode,
                         cur_tu,                   
@ -341,6 +346,7 @@ static INLINE double get_coeff_cabac_cost(
      &cabac_copy,
      coeff,
      width,
+      height,
      color,
      scan_mode,
      &bits);
@ -391,14 +397,36 @@ double uvg_get_coeff_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
  cu_info_t* cur_tu,
-  int32_t width,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
-  int8_t tr_skip)
+  int8_t tr_skip,
+  int coeff_order)
 {
  uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
  uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;

+  const int width  = color == COLOR_Y ? cu_loc->width  : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  int x_local = cu_loc->x % LCU_WIDTH;
+  int y_local = cu_loc->y % LCU_WIDTH;
+  const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int lcu_width = color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C;
+
+
+  const coeff_t* coeff_ptr = NULL;
+  coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
+
+  if (coeff_order == COEFF_ORDER_LINEAR) {
+    coeff_ptr = coeff;
+  }
+  else {
+    // Coeff order CU
+    uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width);
+    coeff_ptr = sub_coeff;
+  }
+
  if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
      state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) {
    // TODO: do we need to assert(0) out of the fast-estimation branch if we
@ -409,17 +437,17 @@ double uvg_get_coeff_cost(
      return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
    } else {
      uint64_t weights = uvg_fast_coeff_get_weights(state);
-      uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
+      uint32_t fast_cost = uvg_fast_coeff_cost(coeff_ptr, width, height, weights); // coeff_ptr: linear coefficients (the input itself, or the sub-block copied out of the CU-ordered buffer)
      if (check_accuracy) {
-        double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+        double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); // CABAC cost, computed only to log the fast estimate's accuracy
        save_accuracy(state->qp, ccc, fast_cost);
      }
      return fast_cost;
    }
  } else {
-    double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+    double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu);
    if (save_cccs) {
-      save_ccc(state->qp, coeff, width * width, ccc);
+      save_ccc(state->qp, coeff_ptr, width * height, ccc); // pass the same linearized width*height coefficients the cost was computed from, not the raw (possibly CU-ordered) input
    }
    return ccc;
  }
@ -684,12 +712,13 @@ void uvg_rdoq_sign_hiding(
  const int32_t last_pos,
  const coeff_t *const coeffs,
  coeff_t *const quant_coeffs,
-    const int8_t color)
+  const int8_t color,
+  const bool need_sqrt_adjust)
 {
  const encoder_control_t * const ctrl = state->encoder_control;
  const double lambda = color ? state->c_lambda : state->lambda;

-  int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6];
+  int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6];
  // This somehow scales quant_delta into fractional bits. Instead of the bits
  // being multiplied by lambda, the residual is divided by it, or something
  // like that.
@ -814,28 +843,28 @@ void uvg_rdoq_sign_hiding(
  }
 }

-static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t  posX, uint32_t  posY, uint32_t width, uint32_t height)
+static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t  posX, uint32_t  posY, uint32_t width, uint32_t height, uint8_t mts_index)
 {
  const coeff_t* pData = coeff + posX + posY * width;
  coeff_t          sum = 0;
  if (posX < width - 1)
  {
-    sum += abs(pData[1]);
+    sum += mts_index && posX + 1 >= 16 ? 0 : abs(pData[1]);
    if (posX < width - 2)
    {
-      sum += abs(pData[2]);
+      sum += mts_index && posX + 2 >= 16 ? 0 : abs(pData[2]);
    }
    if (posY < height - 1)
    {
-      sum += abs(pData[width + 1]);
+      sum += mts_index && (posY + 1 >= 16 || posX + 1 >= 16) ? 0 : abs(pData[width + 1]);
    }
  }
  if (posY < height - 1)
  {
-    sum += abs(pData[width]);
+    sum += mts_index && posY + 1 >= 16 ? 0 : abs(pData[width]);
    if (posY < height - 2)
    {
-      sum += abs(pData[width << 1]);
+      sum += mts_index && posY + 2 >= 16 ? 0 : abs(pData[width << 1]);
    }
  }
  return MAX(MIN(sum - 5 * baseLevel, 31), 0);
@ -1141,7 +1170,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
  const int  max_log2_tr_dynamic_range = 15;
  uint32_t log2_tr_width = uvg_math_floor_log2(width);
  uint32_t log2_tr_height = uvg_math_floor_log2(height);
-  const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
+  const uint32_t log2_block_width  = uvg_g_convert_to_log2[width];
+  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
  const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0];
  const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1];

@ -1166,15 +1196,18 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_

  switch (cg_num) {
  case  1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); FILL_ARRAY(cost_coeffgroup_sig, 0, 1); break;
+  case  2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); FILL_ARRAY(cost_coeffgroup_sig, 0, 2); break;
  case  4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); FILL_ARRAY(cost_coeffgroup_sig, 0, 4);  break;
+  case  8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); FILL_ARRAY(cost_coeffgroup_sig, 0, 8);  break;
  case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); FILL_ARRAY(cost_coeffgroup_sig, 0, 16);  break;
+  case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); FILL_ARRAY(cost_coeffgroup_sig, 0, 32);  break;
  case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); FILL_ARRAY(cost_coeffgroup_sig, 0, 64); break;
-  default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
+  default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups"); // message lists every count handled by the cases above
  }

  const bool   needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const int    q_bits = QUANT_SHIFT + qp_scaled / 6  + (needs_sqrt2_scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
-  const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6];
+  const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6];
 
  const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff;

@ -1182,8 +1215,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_

  const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1;

-  const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1];
-  const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
+  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
+  const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);

  uint32_t coeff_levels[3];
  double   coeff_level_error[4];
@ -1221,8 +1254,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
      scan_pos = (sbId << log2_cg_size) + scan_pos_in_sb;
      int last_pos_coded = sbSizeM1;
      uint32_t blkpos = scan[scan_pos];
-      uint32_t  pos_y = blkpos >> log2_block_size;
-      uint32_t  pos_x = blkpos - (pos_y << log2_block_size);
+      uint32_t  pos_y = blkpos >> log2_block_width;
+      uint32_t  pos_x = blkpos - (pos_y << log2_block_width); 
      //===== quantization =====

      // set coeff
@ -1365,6 +1398,48 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
  return abs_sum;
 }

+
+/**
+ * \brief Derive the sig_coeff_flag context offset from the neighbourhood of
+ *        a coefficient position.
+ *
+ * Up to five neighbours of (pos_x, pos_y) are inspected, each only when it
+ * lies inside the width x height block: (x+1, y), (x+2, y), (x+1, y+1),
+ * (x, y+1) and (x, y+2). Every neighbour with absolute value a adds
+ * MIN(4 + (a & 1), a) to the running sum and, when non-zero, one to the
+ * non-zero count. When \p mts is non-zero, a neighbour whose x or y
+ * coordinate is 16 or larger is counted as zero.
+ *
+ * \param coeff      coefficient levels, row-major with a stride of \p width
+ * \param pos_x      x coordinate of the current coefficient
+ * \param pos_y      y coordinate of the current coefficient
+ * \param width      block width in coefficients
+ * \param height     block height in coefficients
+ * \param color      colour component; COLOR_Y gets an extra offset for diag < 5
+ * \param temp_diag  output: pos_x + pos_y
+ * \param temp_sum   output: neighbour sum minus the number of non-zero neighbours
+ * \param mts        non-zero to treat neighbours at x >= 16 or y >= 16 as zero
+ * \returns context offset: MIN((sum + 1) >> 1, 3) plus the diagonal-dependent offsets
+ */
+static uint32_t context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
+                                            uint32_t width, uint32_t height, int8_t color,
+                                            int32_t* temp_diag, int32_t* temp_sum, int8_t mts)
+{
+  const coeff_t* data = coeff + pos_x + pos_y * width;
+  const int     diag = pos_x + pos_y;
+  int           num_pos = 0;
+  int           sum_abs = 0;
+#define UPDATE(x) {int a=abs(x);sum_abs+=MIN(4+(a&1),a);num_pos+=(a?1:0);}
+  if (pos_x < width - 1)
+  {
+    UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[1]);
+    if (pos_x < width - 2)
+    {
+      UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[2]);
+    }
+    if (pos_y < height - 1)
+    {
+      UPDATE(mts && (pos_y + 1 >= 16 || pos_x + 1 >= 16) ? 0 : data[width + 1]);
+    }
+  }
+  if (pos_y < height - 1)
+  {
+    // data[width] is the neighbour at (pos_x, pos_y + 1): only its row index
+    // can leave the 16x16 region, so the test must be on pos_y.
+    UPDATE(mts && pos_y + 1 >= 16 ? 0 : data[width]);
+    if (pos_y < height - 2)
+    {
+      // data[width << 1] is the neighbour at (pos_x, pos_y + 2).
+      UPDATE(mts && pos_y + 2 >= 16 ? 0 : data[width << 1]);
+    }
+  }
+#undef UPDATE
+  int ctx_ofs = MIN((sum_abs + 1) >> 1, 3) + (diag < 2 ? 4 : 0);
+  if (color == COLOR_Y)
+  {
+    ctx_ofs += diag < 5 ? 4 : 0;
+  }
+
+  *temp_diag = diag;
+  *temp_sum = sum_abs - num_pos;
+  return ctx_ofs;
+}
+
 /** RDOQ with CABAC
 * \returns void
 * Rate distortion optimized quantization for entropy
@ -1377,31 +1452,35 @@ void uvg_rdoq(
  coeff_t *dest_coeff,
  int32_t width,
  int32_t height,
-  int8_t type,
+  int8_t color,
  int8_t scan_mode,
  int8_t block_type,
-  int8_t tr_depth,
  uint16_t cbf,
-  uint8_t lfnst_idx)
+  uint8_t lfnst_idx, uint8_t mts_idx)
 {
  const encoder_control_t * const encoder = state->encoder_control;
  cabac_data_t * const cabac = &state->cabac;
-  uint32_t log2_tr_width      = uvg_math_floor_log2( height );
-  uint32_t log2_tr_height      = uvg_math_floor_log2( width );
-  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1);  // Represents scaling through forward transform
+  const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
+  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size
+
+  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1);  // Represents scaling through forward transform
  uint16_t go_rice_param     = 0;
  uint32_t reg_bins = (width * height * 28) >> 4;
-  const uint32_t log2_block_size   = uvg_g_convert_to_bit[ width ] + 2;
-  int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + type;
  
-  int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
+  int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + color;

-  int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
+  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
  
-  const double lambda = type ? state->c_lambda : state->lambda;
+  int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift - needs_block_size_trafo_scale;

-  const int32_t *quant_coeff  = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
-  const double *err_scale     = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
+  const double lambda = color ? state->c_lambda : state->lambda;
+  const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
+  const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF;
+
+  const int32_t *quant_coeff  = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6];
+  const double *err_scale     = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6];

  double block_uncoded_cost = 0;
  
@ -1415,14 +1494,19 @@ void uvg_rdoq(

  memset(dest_coeff, 0, sizeof(coeff_t) * width * height);

-  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1];
+  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
+  const uint32_t log2_cg_width  = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0];
+  const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];

-  const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2));
+  const uint32_t cg_width  = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width);
+  const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height);
+
+  const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
+  const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);

-  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
  const uint32_t cg_size = 16;
  const int32_t  shift = 4 >> 1;
-  const uint32_t num_blk_side = width >> shift;
+  const uint32_t num_blk_side = MAX(width >> shift, 1);
  double   cost_coeffgroup_sig[ 64 ];
  uint32_t sig_coeffgroup_flag[ 64 ];

@ -1431,26 +1515,34 @@ void uvg_rdoq(
  int32_t temp_diag = -1;
  int32_t temp_sum = -1;

-  const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ];
-
  int32_t cg_last_scanpos = -1;
  int32_t last_scanpos = -1;

-  uint32_t cg_num = width * height >> 4;
+  uint32_t       cg_num          = lfnst_idx > 0 ? 1 : width * height >> 4;
+
+  double         dTransShift = (double)transform_shift + (needs_block_size_trafo_scale ? -0.5 : 0.0);
+  // Compensate for scaling of bitcount in Lagrange cost function
+  double scale       = CTX_FRAC_ONE_BIT;
+  // Compensate for scaling through forward transform
+  scale              = scale * pow(2.0, -2.0 * dTransShift);
+  const double  default_error_scale = scale / default_quant_coeff / default_quant_coeff;

  // Explicitly tell the only possible numbers of elements to be zeroed.
  // Hope the compiler is able to utilize this information.
  switch (cg_num) {
    case  1: FILL_ARRAY(sig_coeffgroup_flag, 0,  1); break;
+    case  2: FILL_ARRAY(sig_coeffgroup_flag, 0,  2); break;
    case  4: FILL_ARRAY(sig_coeffgroup_flag, 0,  4); break;
+    case  8: FILL_ARRAY(sig_coeffgroup_flag, 0,  8); break;
    case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break;
+    case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); break;
    case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
-    default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
+    default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups");
  }

-  cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[type ? 2 : 0]);
-  cabac_ctx_t *baseCtx              = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]);
-  cabac_ctx_t* base_gt1_ctx = (type == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]);
+  cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]);
+  cabac_ctx_t *baseCtx              = (color == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]);
+  cabac_ctx_t* base_gt1_ctx = (color == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]);

  struct {
    double coded_level_and_dist;
@ -1463,21 +1555,26 @@ void uvg_rdoq(
  //Find last cg and last scanpos
  const int max_lfnst_pos = ((height == 4 && width == 4) || (height == 8 && width == 8)) ? 7 : 15;
  int32_t   cg_scanpos;
+  uint32_t  max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1;
  for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--)
  {
-    for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--)
+    uint32_t cg_blkpos = scan_cg[cg_scanpos];
+    uint32_t cg_pos_y = cg_blkpos / num_blk_side;
+    uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side);
+    if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue;
+    for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--)
    {
      int32_t  scanpos        = cg_scanpos*cg_size + scanpos_in_cg;
-      if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break;
+      
      uint32_t blkpos         = scan[scanpos];
-      int32_t q               = quant_coeff[blkpos];
+      int32_t q               = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff;
      int32_t level_double    = coef[blkpos];
      level_double            = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1)));
      uint32_t max_abs_level  = (level_double + (1 << (q_bits - 1))) >> q_bits;

      double err = (double)level_double;

-      cost_coeff0[scanpos] = err * err * err_scale[blkpos];      
+      cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale);      
      
      dest_coeff[blkpos] = max_abs_level;
      if (max_abs_level > 0) {
@ -1507,43 +1604,45 @@ void uvg_rdoq(
    uint32_t cg_pos_x   = cg_blkpos - (cg_pos_y * num_blk_side);

    FILL(rd_stats, 0);
-    for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--)  {
+    if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue;
+    for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--)  {
      int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
      if (scanpos > last_scanpos) {
        continue;
      }
      uint32_t blkpos         = scan[scanpos];
-      int32_t q               = quant_coeff[blkpos];
-      double temp             = err_scale[blkpos];
+      int32_t q               = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff;
+      double temp             = (use_scaling_list ? err_scale[blkpos] : default_error_scale);
      int32_t level_double    = coef[blkpos];
      level_double            = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1)));
      uint32_t max_abs_level  = (level_double + (1 << (q_bits - 1))) >> q_bits;
      dest_coeff[blkpos] = max_abs_level;
      double err = (double)level_double;

-      cost_coeff0[scanpos] = err * err * err_scale[blkpos];
+      cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale);

      block_uncoded_cost      += cost_coeff0[ scanpos ];

      if (last_scanpos >= 0) {

-        uint32_t  pos_y = blkpos >> log2_block_size;
-        uint32_t  pos_x = blkpos - (pos_y << log2_block_size);
+        uint32_t  pos_y = blkpos >> log2_block_width;
+        uint32_t  pos_x = blkpos - (pos_y << log2_block_width);
        //===== coefficient level estimation =====
        int32_t  level;
        
        uint16_t ctx_sig = 0;
        if (scanpos != last_scanpos) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, type, &temp_diag, &temp_sum);
+          // VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here
+          ctx_sig = context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum, mts_idx);
        }
        
        if (temp_diag != -1) {
-          ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((type == 0) ? 15 : 5) : (type == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0);
+          ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((color == 0) ? 15 : 5) : (color == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0);
        }
        else ctx_set = 0;

        if (reg_bins < 4) {
-          int  sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height);
+          int  sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height,mts_idx);
          go_rice_param = g_auiGoRiceParsCoeff[sumAll];
        }

@ -1554,12 +1653,12 @@ void uvg_rdoq(
        if (scanpos == last_scanpos) {
          level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos],
            level_double, max_abs_level, 0, gt1_ctx, gt2_ctx, par_ctx, go_rice_param,
-            reg_bins, q_bits, temp, 1, type);          
+            reg_bins, q_bits, temp, 1, color);          
        }
        else {
          level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos],
            level_double, max_abs_level, ctx_sig, gt1_ctx, gt2_ctx, par_ctx, go_rice_param,
-            reg_bins, q_bits, temp, 0, type);
+            reg_bins, q_bits, temp, 0, color);
          if (encoder->cfg.signhide_enable) {
            int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1);
            int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0);
@ -1572,14 +1671,14 @@ void uvg_rdoq(
        if (encoder->cfg.signhide_enable) {
          sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8);
          if (level > 0) {
-            int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false);
-            sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
-            sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
+            int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false);
+            sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
+            sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
          }
          else { // level == 0
            if (reg_bins < 4) {
-              int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false);
-              sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
+              int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false);
+              sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
            }
            else {
              sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_gt1_ctx[gt1_ctx], 0);
@ -1595,7 +1694,7 @@ void uvg_rdoq(
        }
        else if (reg_bins >= 4) {
          reg_bins -= (level < 2 ? level : 3) + (scanpos != last_scanpos);
-          int  sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height);
+          int  sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height, mts_idx);
          go_rice_param = g_auiGoRiceParsCoeff[sumAll];
        }
      }
@ -1620,7 +1719,7 @@ void uvg_rdoq(
    if( cg_scanpos ) {
      if (sig_coeffgroup_flag[cg_blkpos] == 0) {
        uint32_t ctx_sig  = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
-                                                        cg_pos_y, cg_width);
+                                                        cg_pos_y, cg_width, cg_height);
        cost_coeffgroup_sig[cg_scanpos] = lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
        base_cost += cost_coeffgroup_sig[cg_scanpos]  - rd_stats.sig_cost;
      } else {
@ -1636,7 +1735,7 @@ void uvg_rdoq(

          // add SigCoeffGroupFlag cost to total cost
          ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
-            cg_pos_y, cg_width);
+            cg_pos_y, cg_width, cg_height);

          cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
          base_cost += cost_coeffgroup_sig[cg_scanpos];
@ -1656,7 +1755,7 @@ void uvg_rdoq(
            cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);

            // reset coeffs to 0 in this block
-            for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
+            for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) {
              int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
              uint32_t blkpos = scan[scanpos];
              if (dest_coeff[blkpos]){
@ -1679,12 +1778,12 @@ void uvg_rdoq(
  int8_t found_last        = 0;
  int32_t best_last_idx_p1 = 0;

-  if( block_type != CU_INTRA && !type ) {
+  if( block_type != CU_INTRA && !color ) {
    best_cost  = block_uncoded_cost +  lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
    base_cost +=   lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
  } else {
    cabac_ctx_t* base_cbf_model = NULL;
-    switch (type) {
+    switch (color) {
      case COLOR_Y:
        base_cbf_model = cabac->ctx.qt_cbf_model_luma;
        break;
@ -1697,25 +1796,26 @@ void uvg_rdoq(
      default:
        assert(0);
    }
-    ctx_cbf    = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U));
+    // This cbf should work even with non-square blocks
+    ctx_cbf    = ( color != COLOR_V ? 0 : cbf_is_set(cbf, COLOR_U));
    best_cost  = block_uncoded_cost +  lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
    base_cost +=   lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
  }

-  calc_last_bits(state, width, height, type, last_x_bits, last_y_bits);
+  calc_last_bits(state, width, height, color, last_x_bits, last_y_bits);
  for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
    uint32_t cg_blkpos = scan_cg[cg_scanpos];
    base_cost -= cost_coeffgroup_sig[cg_scanpos];

    if (sig_coeffgroup_flag[ cg_blkpos ]) {
-      for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
+      for ( int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) {
        int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
        if (scanpos > last_scanpos) continue;
        uint32_t blkpos  = scan[scanpos];

        if( dest_coeff[ blkpos ] ) {
-          uint32_t   pos_y = blkpos >> log2_block_size;
-          uint32_t   pos_x = blkpos - ( pos_y << log2_block_size );
+          uint32_t   pos_y = blkpos >> log2_block_width;
+          uint32_t   pos_x = blkpos - ( pos_y << log2_block_width );

          double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits );
          double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
@ -1739,19 +1839,31 @@ void uvg_rdoq(
  } // end for

  uint32_t abs_sum = 0;
+  if(!mts_idx || (width < 32 && height < 32)) {
    for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
      int32_t blkPos     = scan[scanpos];
      int32_t level      = dest_coeff[blkPos];
      abs_sum            += level;
      dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level);
    }
+  }
+  else { // mts_idx != 0 and at least one block dimension is 32 or larger
+    for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
+      int32_t blkPos     = scan[scanpos];
+      int32_t blk_x = blkPos & (width - 1);
+      int32_t blk_y = blkPos >> log2_block_width;
+      int32_t level      = blk_x >= 16 || blk_y >= 16 ? 0 : dest_coeff[blkPos]; // drop everything outside the top-left 16x16
+      abs_sum            += level;
+      dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); // level is a magnitude; the sign comes from the input coefficient
+    }
+  }
  //===== clean uncoded coefficients =====
  for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) {
    dest_coeff[scan[scanpos]] = 0;
  }

  if (encoder->cfg.signhide_enable && abs_sum >= 2) {
-    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, type);
+    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale);
  }
 }

--- a/src/rdo.h
+++ b/src/rdo.h
@ -44,6 +44,8 @@
 #include "global.h" // IWYU pragma: keep
 #include "search_inter.h"

+#define QUANT_SHIFT 14
+#define IQUANT_SHIFT 6

 extern const uint32_t uvg_g_go_rice_range[5];
 extern const uint32_t uvg_g_go_rice_prefix_len[5];
@ -60,9 +62,8 @@ void  uvg_rdoq(
  int8_t type,
  int8_t scan_mode,
  int8_t block_type,
-  int8_t tr_depth,
  uint16_t cbf,
-  uint8_t lfnst_idx);
+  uint8_t lfnst_idx, uint8_t mts_idx);


 int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_coeff, int32_t width,
@ -73,10 +74,11 @@ double uvg_get_coeff_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
  cu_info_t* cur_tu,
-  int32_t width,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
-  int8_t tr_skip);
+  int8_t tr_skip,
+  int coeff_order);

 int32_t uvg_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_gt1, uint16_t ctx_num_gt2, uint16_t ctx_num_par,
                    uint16_t abs_go_rice, uint32_t reg_bins, int8_t type, int use_limited_prefix_length);
--- a/src/scalinglist.c
+++ b/src/scalinglist.c
@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] =
  24, 25, 28, 33, 41, 54, 71, 91
 };

-const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564};
-const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72};
+// Forward quantization scales, indexed as [sqrt-adjust flag][qp % 6].
+// Row 0 is the regular table and is the one used when building the scaling
+// lists; row 1 is picked when the caller's sqrt(2) adjustment flag is set.
+const int16_t uvg_g_quant_scales[2][6] = {
+  { 26214, 23302, 20560, 18396, 16384, 14564 },
+  { 18396, 16384, 14564, 13107, 11651, 10280 },
+};
+
+// Inverse quantization scales, same [sqrt-adjust flag][qp % 6] layout.
+const int16_t uvg_g_inv_quant_scales[2][6] = {
+  { 40, 45, 51, 57, 64,  72 },
+  { 57, 64, 72, 80, 90, 102 },
+};


 /**
@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons
  int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp];
  int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp];

-  // Encoder list
-  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio,
+  // Encoder list TODO: the sqrt adjusted lists
+  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio,
                              MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);
  // Decoder list
-  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio,
+  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio,
                          MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);


--- a/src/search.c
+++ b/src/search.c
--- a/src/search.h
+++ b/src/search.h
@ -84,19 +84,17 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);

 void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);

-double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                           const int x_px, const int y_px, const int depth,
+double uvg_cu_rd_cost_luma(
+  const encoder_state_t *const state,
+  const cu_loc_t* const cu_loc,
  const cu_info_t *const pred_cu,
-                           lcu_t *const lcu);
-double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                             const int x_px, const int y_px, const int depth,
+  lcu_t *const lcu,
+  uint8_t isp_cbf);
+double uvg_cu_rd_cost_chroma(
+  const encoder_state_t *const state,
  cu_info_t *const pred_cu,
-                             lcu_t *const lcu);
+  lcu_t *const lcu,
+  const cu_loc_t * const);

-void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
-                          tree_type);
-
-void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
-void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);

 #endif
--- a/src/search_ibc.c
+++ b/src/search_ibc.c
@ -75,7 +75,8 @@ typedef struct {
   * \brief Possible optimized SAD implementation for the width, leave as
   *        NULL for arbitrary-width blocks
   */
-  optimized_sad_func_ptr_t optimized_sad;
+  optimized_sad_func_ptr_t optimized_sad_y;
+  optimized_sad_func_ptr_t optimized_sad_uv;

  lcu_t                   *lcu;

@ -109,8 +110,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x,
 }


-static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y)
+static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y)
 {  
+  const uint32_t x = loc->x;
+  const uint32_t y = loc->y;
  const int x_scu    = SUB_SCU(x);
  const int y_scu    = SUB_SCU(y);

@ -132,9 +135,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu
  cur_cu->inter.mv[0][0]                  = mv_x * (1 << INTERNAL_MV_PREC);;
  cur_cu->inter.mv[0][1]                  = mv_y * (1 << INTERNAL_MV_PREC);;

-  uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400);
+  uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc);
  
  *cur_cu = cu_backup;
+  uint32_t width = loc->width;
+  uint32_t height = loc->height;

  cost = uvg_satd_any_size(width,
                           width,
@ -162,10 +167,15 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu
 }


-static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y)
+static uint32_t calculate_ibc_cost_sad(ibc_search_info_t *info, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y)
 {  
+  const uint32_t x = loc->x;
+  const uint32_t y = loc->y;
+  lcu_t         *lcu    = info->lcu;
  cu_info_t     *cur_cu     = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
  
+  const encoder_state_t* state = info->state;
+  
  cu_info_t cu_backup  = *cur_cu;
  uint32_t       cost  = MAX_INT;

@ -173,6 +183,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s
  const int y_scu    = SUB_SCU(y);
  const uint32_t offset = x_scu + y_scu * LCU_WIDTH;
  const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
+  const uint32_t width = loc->width;
+  const uint32_t height = loc->height;

  cur_cu->type    = CU_IBC;
  cur_cu->inter.mv_dir   = 1;
@ -183,23 +195,26 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s
  cur_cu->inter.mv[0][0]                  = mv_x * (1 << INTERNAL_MV_PREC);;
  cur_cu->inter.mv[0][1]                  = mv_y * (1 << INTERNAL_MV_PREC);;

-  uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400);
+  uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc);
  
  *cur_cu = cu_backup;

-  if (optimized_sad != NULL) {
-    cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride);
-    if(state->encoder_control->chroma_format != UVG_CSP_400) {
-      cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
-      cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
-    }
+  if (info->optimized_sad_y != NULL) {
+    cost = info->optimized_sad_y(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride);
  } else {
    cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride);
-    if(state->encoder_control->chroma_format != UVG_CSP_400) {
+  }
+
+  // ToDo: Enable chroma cost calculation
+  /* if (state->encoder_control->chroma_format != UVG_CSP_400) {
+    if (info->optimized_sad_uv != NULL) {    
+      cost += info->optimized_sad_uv(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
+      cost += info->optimized_sad_uv(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
+    } else {
      cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
      cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
    }
-  }
+  }*/

  return cost;
 }
@ -235,8 +250,11 @@ static bool check_mv_cost(ibc_search_info_t *info,

  double bitcost = 0;
  double cost    = MAX_DOUBLE;
+  cu_loc_t loc;
+  uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height);

-  cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y);
+
+  cost = calculate_ibc_cost_sad(info, &loc, x, y);

  if (cost >= *best_cost) return false;

@ -246,7 +264,7 @@ static bool check_mv_cost(ibc_search_info_t *info,
      info->mv_cand,
      NULL,
      0,
-      NULL,
+      0,
      &bitcost
  );

@ -782,39 +800,23 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
 * \param amvp        Return searched AMVP PUs sorted by costs
 * \param merge       Return searched Merge PUs sorted by costs
 */
-static void search_pu_ibc(encoder_state_t * const state,
-  int x_cu, int y_cu,
-  int depth,
-  part_mode_t part_mode,
-  int i_pu,
+static void search_pu_ibc(
+  encoder_state_t * const state,
+  const cu_loc_t * const  cu_loc,
  unit_stats_map_t       *amvp,
  unit_stats_map_t       *merge,
  ibc_search_info_t      *info)
 {
  const uvg_config          *cfg      = &state->encoder_control->cfg;
  const videoframe_t * const frame    = state->tile->frame;
-  const int width_cu = LCU_WIDTH >> depth;
-  const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
-  const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
-  const int width = PU_GET_W(part_mode, width_cu, i_pu);
-  const int height = PU_GET_H(part_mode, width_cu, i_pu);
-
-  // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
-  // nRx2N partitions.
-  const bool merge_a1 = i_pu == 0 || width >= height;
-  // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and
-  // 2NxnD partitions.
-  const bool merge_b1 = i_pu == 0 || width <= height;
-
+  const int                  width_cu = cu_loc->width;
+  const int                  height_cu= cu_loc->height;

  lcu_t                     *lcu      = info->lcu;
-  const int x_local = SUB_SCU(x);
-  const int y_local = SUB_SCU(y);
+  const int                  x_local  = SUB_SCU(cu_loc->x);
+  const int                  y_local  = SUB_SCU(cu_loc->y);
  cu_info_t                 *cur_pu   = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
  cur_pu->type                        = CU_IBC;
-  cur_pu->part_size = part_mode;
-  cur_pu->depth = depth;
-  cur_pu->tr_depth = depth;
  cur_pu->qp                          = state->qp;
  cur_pu->inter.mv_dir                = 1;

@ -825,20 +827,20 @@ static void search_pu_ibc(encoder_state_t * const state,

  info->state    = state;
  info->pic      = frame->source;
-  info->origin.x       = x;
-  info->origin.y       = y;
-  info->width          = width;
-  info->height         = height;
-  info->mvd_cost_func  = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
-  info->optimized_sad  = uvg_get_optimized_sad(width);
+  info->origin.x = cu_loc->x;
+  info->origin.y = cu_loc->y;
+  info->width    = width_cu;
+  info->height   = height_cu;
+  info->mvd_cost_func =
+    cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
+  info->optimized_sad_y  = uvg_get_optimized_sad(width_cu);
+  info->optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width);
  info->lcu           = lcu;

  // Search for merge mode candidates
  info->num_merge_cand = uvg_inter_get_merge_cand(
                          state,
-                          x, y,
-                          width, height,
-                          merge_a1, merge_b1,
+                          cu_loc,
                          info->merge_cand,
                          lcu);

@ -853,7 +855,7 @@ static void search_pu_ibc(encoder_state_t * const state,
 #ifdef COMPLETE_PRED_MODE_BITS
  // Technically counting these bits would be correct, however counting
  // them universally degrades quality so this block is disabled by default
-  const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0);
+  const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0);
 #else
  const double no_skip_flag = 0;
 #endif
@ -875,7 +877,7 @@ static void search_pu_ibc(encoder_state_t * const state,
    {
      continue;
    }
-    uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu);
+    uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc);
    merge->unit[merge->size] = *cur_pu;
    merge->unit[merge->size].type = CU_IBC;
    merge->unit[merge->size].merge_idx = merge_idx;
@ -883,11 +885,11 @@ static void search_pu_ibc(encoder_state_t * const state,
    merge->unit[merge->size].skipped = false;

    double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
-    if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
-      uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
+    if(state->encoder_control->cfg.rdo >= 2) {
+      uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc);
    }
    else {
-      merge->cost[merge->size] = uvg_satd_any_size(width, height,
+      merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu,
        lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
        lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
      bits += no_skip_flag;
@ -909,7 +911,7 @@ static void search_pu_ibc(encoder_state_t * const state,
    
  // Early Skip Mode Decision
  bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-  if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
+  if (cfg->early_skip) {
    for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
      if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
        merge->size = 1;
@ -919,6 +921,7 @@ static void search_pu_ibc(encoder_state_t * const state,
        merge->keys[0] = 0;
      }
      else if(cfg->rdo < 2) {
+        const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // NOTE(review): no remaining use of `depth` is visible in this hunk (all depth arguments were dropped) — confirm it is still needed, otherwise remove
        // Reconstruct blocks with merge candidate.
        // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
        // and chroma exists.
@ -927,19 +930,18 @@ static void search_pu_ibc(encoder_state_t * const state,
        cur_pu->inter.mv_dir    = info->merge_cand[merge_idx].dir;
        cur_pu->inter.mv[0][0]  = info->merge_cand[merge_idx].mv[0][0];
        cur_pu->inter.mv[0][1]  = info->merge_cand[merge_idx].mv[0][1];
-        uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T);
-        uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
-        uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
+        uvg_inter_recon_cu(state, lcu, true, false, cu_loc);
+        uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T);

-        if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
+        if (cbf_is_set(cur_pu->cbf, COLOR_Y)) {
          continue;
        }
        else if (has_chroma) {
-          uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
+          uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc);
          uvg_quantize_lcu_residual(state, false, has_chroma, 
            false, /*we are only checking for lack of coeffs so no need to check jccr*/
-            x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
-          if (!cbf_is_set_any(cur_pu->cbf, depth)) {
+            cu_loc, cur_pu, lcu, true, UVG_BOTH_T);
+          if (!cbf_is_set_any(cur_pu->cbf)) {
            cur_pu->type = CU_IBC;
            cur_pu->merge_idx = merge_idx;
            cur_pu->skipped = true;
@ -965,14 +967,11 @@ static void search_pu_ibc(encoder_state_t * const state,
  // Do the motion search

  uvg_inter_get_mv_cand(info->state,    
-    info->origin.x,
-    info->origin.y,
-    info->width,
-    info->height,
    info->mv_cand,
    cur_pu,
    lcu,
-    NULL);
+    0,
+    cu_loc);

  vector2d_t best_mv = { 0, 0 };

@ -1003,9 +1002,7 @@ static void search_pu_ibc(encoder_state_t * const state,
    best_cost = calculate_ibc_cost_satd(
      info->state,
      lcu,
-      info->origin.x,
-      info->origin.y,
-      info->width,
+      cu_loc,
      (best_mv.x >> INTERNAL_MV_PREC),
      (best_mv.y >> INTERNAL_MV_PREC));
    best_cost += best_bits * info->state->lambda;
@ -1052,16 +1049,16 @@ static void search_pu_ibc(encoder_state_t * const state,
  };


-  if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
-    if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);    
+  if (state->encoder_control->cfg.rdo >= 2) {
+    if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc);    
  }


  if(cfg->rdo < 2) {
    int predmode_ctx;

-    const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3;
-    const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
+    const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
+    const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx);
    const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0);

    const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
@ -1077,33 +1074,29 @@ static void search_pu_ibc(encoder_state_t * const state,
 #include "threads.h"

 static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
-  int x, int y, int depth,
+  const cu_loc_t* cu_loc,
  lcu_t* lcu,
  double* inter_cost,
  double* inter_bitcost)
 {
-  const int x_cu = x;
-  const int y_cu = y;
+  const int x_cu = cu_loc->x;
+  const int y_cu = cu_loc->y;
  const int part_mode = SIZE_2Nx2N;
  const uvg_config          *cfg      = &state->encoder_control->cfg;
  const videoframe_t * const frame    = state->tile->frame;
-  const int                  width_cu = LCU_WIDTH >> depth;
-  const int                  width    = PU_GET_W(part_mode, width_cu, 0);
-  const int                  height   = PU_GET_H(part_mode, width_cu, 0);
+  const int                  width_cu = cu_loc->width;
+  const int                  height_cu = cu_loc->height;

  const bool                 merge_a1  = true;
  const bool                 merge_b1  = true;

  ibc_search_info_t info;

-  const int  x_local = SUB_SCU(x);
-  const int  y_local = SUB_SCU(y);
+  const int  x_local = SUB_SCU(x_cu);
+  const int  y_local = SUB_SCU(y_cu);
  cu_info_t *cur_pu  = LCU_GET_CU_AT_PX(lcu, x_local, y_local);

  cur_pu->type       = CU_IBC;
-  cur_pu->part_size  = part_mode;
-  cur_pu->depth      = depth;
-  cur_pu->tr_depth   = depth;
  cur_pu->qp         = state->qp;

  // Default to candidate 0
@ -1113,24 +1106,20 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,

  info.state    = state;
  info.pic      = frame->source;
-  info.origin.x = x;
-  info.origin.y = y;
-  info.width    = width;
-  info.height   = height;
+  info.origin.x = cu_loc->x;
+  info.origin.y = cu_loc->y;
+  info.width    = width_cu;
+  info.height   = height_cu;
  info.mvd_cost_func =
    cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
-  info.optimized_sad  = uvg_get_optimized_sad(width);
+  info.optimized_sad_y  = uvg_get_optimized_sad(width_cu);
+  info.optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width);
  info.lcu            = lcu;

  // Search for merge mode candidates
  info.num_merge_cand = uvg_inter_get_merge_cand(
    state,
-    x,
-    y,
-    width,
-    height,
-    merge_a1,
-    merge_b1,
+    cu_loc,
    info.merge_cand,
    lcu);

@ -1145,17 +1134,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
  static int    evaluations = 0;
  static int hits = 0;

-
-  UVG_CLOCK_T   hashmap_start_temp;
-  UVG_CLOCK_T   hashmap_end_temp;
-
-
  UVG_CLOCK_T   hashmap_start_real_time;
  UVG_CLOCK_T   hashmap_end_real_time;
  UVG_GET_TIME(&hashmap_start_real_time);

-  int           xx  = x;
-  int           yy  = y;
+  int           xx  = x_cu;
+  int           yy  = y_cu;

  int           best_mv_x    = INT_MAX>>2;
  int           best_mv_y    = INT_MAX>>2;
@ -1185,12 +1169,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
      int pos_y = result->value & 0xffff;
      int mv_x = pos_x - xx;
      int mv_y = pos_y - yy;
-      if (pos_x <= xx - width && pos_y <= yy - height) {
+      if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) {
        valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y);
        if (valid_mv) {
          bool full_block = true; // Is the full block covered by the IBC?
-          for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) {
-            for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) {
+          for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) {
+            for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) {
              uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[
                ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE];

@ -1211,7 +1195,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
          if (full_block) {
            double     cost = ibc_cost, bits = ibc_bitcost;
            vector2d_t mv = { best_mv_x, best_mv_y};
-            cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits);
+            cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, 0, &bits);
            //double cost    = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt;
            //cost += 
            bool better_mv = cost < ibc_cost;
@ -1220,7 +1204,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
              best_mv_y              = mv_y;
              ibc_cost               = cost;
              ibc_bitcost            = bits;
-              fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y);
+              fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y); // NOTE(review): unconditional stderr print each time a better match is recorded — presumably leftover debug output; confirm it should ship
              found_block = true;
              //break;
            }
@ -1238,7 +1222,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
  //if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64)
    //fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found);
   
-  if (!found_block) return;
+  if (!found_block) return 0;

  *inter_cost    = ibc_cost;
  *inter_bitcost = ibc_bitcost;
@ -1267,18 +1251,16 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
  cur_pu->skipped = merged;
  

-  const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
+  const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
  ibc_cost += ibc_flag * state->lambda;
  ibc_bitcost += ibc_flag;

  uvg_inter_recon_cu(
    state,
    lcu,
-    x,
-    y,
-    CU_WIDTH_FROM_DEPTH(depth),
    true,
-    state->encoder_control->chroma_format != UVG_CSP_400);
+    state->encoder_control->chroma_format != UVG_CSP_400,
+    cu_loc);

  if (*inter_cost < MAX_DOUBLE) {
    assert(fracmv_within_ibc_range(
@ -1286,7 +1268,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
      cur_pu->inter.mv[0][0],
      cur_pu->inter.mv[0][1]));
  }
-
+  return 1;
 }


@ -1305,17 +1287,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
 * \param inter_bitcost Return inter bitcost
 */
 void uvg_search_cu_ibc(encoder_state_t * const state,
-                         int x, int y, int depth,
+                       const cu_loc_t * const  cu_loc,
                       lcu_t *lcu,
                       double   *inter_cost,
                       double* inter_bitcost)
 {
  *inter_cost = MAX_DOUBLE;
  *inter_bitcost = MAX_INT;
+
   // Quick hashmap search
  /* uvg_search_hash_cu_ibc(
    state,
-                          x, y, depth,
+                          cu_loc,
                          lcu,
                          inter_cost,
                          inter_bitcost);
@ -1330,8 +1313,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state,
  info.lcu = lcu;

  search_pu_ibc(state,
-                  x, y, depth,
-                  SIZE_2Nx2N, 0,
+                  cu_loc,
                  amvp,
                  &merge,
                  &info);
@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state,
    return;
  }

-  const int x_local = SUB_SCU(x);
-  const int y_local = SUB_SCU(y);
+  const int  x_local = SUB_SCU(cu_loc->x);
+  const int  y_local = SUB_SCU(cu_loc->y);
  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
  *cur_pu = *best_inter_pu;
  cur_pu->type       = CU_IBC;

-  uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
-    true, state->encoder_control->chroma_format != UVG_CSP_400);   
+  uvg_inter_recon_cu(state, lcu, 
+    true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc);   

  if (*inter_cost < MAX_DOUBLE) {    
    assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));
--- a/src/search_ibc.h
+++ b/src/search_ibc.h
@ -46,7 +46,7 @@


 void uvg_search_cu_ibc(encoder_state_t * const state,
-                         int x, int y, int depth,
+                         const cu_loc_t * const  cu_loc,
                         lcu_t *lcu,
                         double *inter_cost,
                         double* inter_bitcost);
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc,
 /**
 * \brief Perform inter search for a single reference frame.
 */
-static void search_pu_inter_ref(inter_search_info_t *info,
-  int depth,
+static void search_pu_inter_ref(
+  inter_search_info_t *info,
  lcu_t *lcu,
  cu_info_t *cur_cu,
  unit_stats_map_t *amvp)
@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info,
  // Get MV candidates
  cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list];

+  cu_loc_t cu_loc;
+  uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height);
+
  uvg_inter_get_mv_cand(info->state,
-    info->origin.x,
-    info->origin.y,
-    info->width,
-    info->height,
                        info->mv_cand,
                        cur_cu,
                        lcu,
-    ref_list);
+                        ref_list,
+                        &cu_loc);

  vector2d_t best_mv = { 0, 0 };

@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info,
 /**
 * \brief Search bipred modes for a PU.
 */
-static void search_pu_inter_bipred(inter_search_info_t *info,
-                                   int depth,
+static void search_pu_inter_bipred(
+  inter_search_info_t *info,
  lcu_t *lcu,
  unit_stats_map_t *amvp_bipred)
 {
+  cu_loc_t cu_loc;
+  uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height);
  const image_list_t *const ref = info->state->frame->ref;
  uint8_t (*ref_LX)[16] = info->state->frame->ref_LX;
  const videoframe_t * const frame = info->state->tile->frame;
@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
    bipred_pu->skipped = false;

    for (int reflist = 0; reflist < 2; reflist++) {
-      uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+      uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc);
    }

    // Don't try merge candidates that don't satisfy mv constraints.
@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
    uvg_inter_recon_bipred(info->state,
                           ref->images[ref_LX[0][merge_cand[i].ref[0]]],
                           ref->images[ref_LX[1][merge_cand[j].ref[1]]],
-                           x, y,
-                           width,
-                           height,
                           mv,
                           lcu,
                           true,
-                           false);
+                           false,
+                           &cu_loc);

    const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
    const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride];
@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
 * \param amvp        Return searched AMVP PUs sorted by costs
 * \param merge       Return searched Merge PUs sorted by costs
 */
-static void search_pu_inter(encoder_state_t * const state,
-  int x_cu, int y_cu,
-  int depth,
-  part_mode_t part_mode,
-  int i_pu,
+static void search_pu_inter(
+  encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  unit_stats_map_t *amvp,
  unit_stats_map_t *merge,
@ -1678,25 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state,
 {
  const uvg_config *cfg = &state->encoder_control->cfg;
  const videoframe_t * const frame = state->tile->frame;
-  const int width_cu = LCU_WIDTH >> depth;
-  const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
-  const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
-  const int width = PU_GET_W(part_mode, width_cu, i_pu);
-  const int height = PU_GET_H(part_mode, width_cu, i_pu);
+  const int width_cu = cu_loc->width;
+  const int height_cu = cu_loc->height; 

-  // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
-  // nRx2N partitions.
-  const bool merge_a1 = i_pu == 0 || width >= height;
-  // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and
-  // 2NxnD partitions.
-  const bool merge_b1 = i_pu == 0 || width <= height;

-  const int x_local = SUB_SCU(x);
-  const int y_local = SUB_SCU(y);
+  const int x_local = SUB_SCU(cu_loc->x);
+  const int y_local = SUB_SCU(cu_loc->y);
  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
  cur_pu->type = CU_NOTSET;
-  cur_pu->part_size = part_mode;
-  cur_pu->depth = depth;
  cur_pu->qp = state->qp;

  // Default to candidate 0
@ -1707,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state,

  info->state          = state;
  info->pic            = frame->source;
-  info->origin.x       = x;
-  info->origin.y       = y;
-  info->width          = width;
-  info->height         = height;
+  info->origin.x       = cu_loc->x;
+  info->origin.y       = cu_loc->y;
+  info->width          = width_cu;
+  info->height         = height_cu;
  info->mvd_cost_func  = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost;
-  info->optimized_sad  = uvg_get_optimized_sad(width);
+  info->optimized_sad  = uvg_get_optimized_sad(width_cu);

  // Search for merge mode candidates
  info->num_merge_cand = uvg_inter_get_merge_cand(
      state,
-      x, y,
-      width, height,
-      merge_a1, merge_b1,
+      cu_loc,
      info->merge_cand,
      lcu
  );
@ -1754,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state,
    // If bipred is not enabled, do not try candidates with mv_dir == 3.
    // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. 
    if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
-    if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue;
+    if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue;

    bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge);

@ -1768,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state,
    {
      continue;
    }
-    uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu);
+    uvg_inter_pred_pu(state, lcu, true, false, cu_loc);
    merge->unit[merge->size] = *cur_pu;
    merge->unit[merge->size].type = CU_INTER;
    merge->unit[merge->size].merge_idx = merge_idx;
@ -1776,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state,
    merge->unit[merge->size].skipped = false;

    double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
-    if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
-      uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
+    if(state->encoder_control->cfg.rdo >= 2) {
+      uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc);
    }
    else {
-      merge->cost[merge->size] = uvg_satd_any_size(width, height,
+      merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height,
        lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
        lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
      bits += no_skip_flag;
@ -1802,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state,
    
  // Early Skip Mode Decision
  bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-  if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
+  if (cfg->early_skip) {
    for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
      if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
        merge->size = 1;
@ -1812,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state,
        merge->keys[0] = 0;
      }
      else if(cfg->rdo < 2) {
+
+        const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // NOTE(review): no remaining use of `depth` is visible in this hunk (all depth arguments were dropped) — confirm it is still needed, otherwise remove
        // Reconstruct blocks with merge candidate.
        // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
        // and chroma exists.
@ -1824,22 +1811,22 @@ static void search_pu_inter(encoder_state_t * const state,
        cur_pu->inter.mv[0][1]  = info->merge_cand[merge_idx].mv[0][1];
        cur_pu->inter.mv[1][0]  = info->merge_cand[merge_idx].mv[1][0];
        cur_pu->inter.mv[1][1]  = info->merge_cand[merge_idx].mv[1][1];
-        uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T);
-        uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
-        uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
+        uvg_inter_recon_cu(state, lcu, true, false, cu_loc);

-        if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
+        uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T);
+
+        if (cbf_is_set(cur_pu->cbf, COLOR_Y)) {
          continue;
        }
        else if (has_chroma) {
-          uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
+          uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc);
          uvg_quantize_lcu_residual(state,
                                    false, has_chroma,
                                    false, /*we are only checking for lack of coeffs so no need to check jccr*/
-                                    x, y, depth, cur_pu, lcu,
+                                    cu_loc, cur_pu, lcu,
                                    true,
                                    UVG_BOTH_T);
-          if (!cbf_is_set_any(cur_pu->cbf, depth)) {
+          if (!cbf_is_set_any(cur_pu->cbf)) {
            cur_pu->type = CU_INTER;
            cur_pu->merge_idx = merge_idx;
            cur_pu->skipped = true;
@ -1871,7 +1858,7 @@ static void search_pu_inter(encoder_state_t * const state,
    info->ref_idx = ref_idx;
    info->ref = state->frame->ref->images[ref_idx];

-    search_pu_inter_ref(info, depth, lcu, cur_pu, amvp);
+    search_pu_inter_ref(info, lcu, cur_pu, amvp);
  }

  assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE);
@ -1936,14 +1923,11 @@ static void search_pu_inter(encoder_state_t * const state,
        info->ref = ref->images[info->ref_idx];

        uvg_inter_get_mv_cand(info->state,
-          info->origin.x,
-          info->origin.y,
-          info->width,
-          info->height,
                              info->mv_cand,
                              unipred_pu,
                              lcu,
-          list);
+                              list,
+                              cu_loc);

        double     frac_cost = MAX_DOUBLE;
        double   frac_bits = MAX_INT;
@ -1964,8 +1948,8 @@ static void search_pu_inter(encoder_state_t * const state,
          unipred_pu->inter.mv[list][1] = frac_mv.y;
          CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand);

-          if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
-            uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits);
+          if (state->encoder_control->cfg.rdo >= 2) {
+            uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc);
          }

          amvp[list].cost[key] = frac_cost;
@ -1987,15 +1971,15 @@ static void search_pu_inter(encoder_state_t * const state,
    amvp[list].size = n_best;
  }

-  if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) {
-    if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);
-    if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]);
+  if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) {
+    if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc);
+    if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc);
  }

  // Search bi-pred positions
  bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B
    && cfg->bipred
-    && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred
+    && cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are restricted to unipred

  if (can_use_bipred) {

@ -2026,25 +2010,23 @@ static void search_pu_inter(encoder_state_t * const state,
      bipred_pu->skipped = false;

      for (int reflist = 0; reflist < 2; reflist++) {
-        uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+        uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc);
      }

      uvg_inter_recon_bipred(info->state,
                             ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]],
                             ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]],
-        x, y,
-        width,
-        height,
-        mv,
-        lcu,
+                             mv, lcu,
                             true,
-        false);
+                             false,
+                             cu_loc
+        );

-      const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
-      const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
+      const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)];
+      const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)];

      best_bipred_cost =
-        uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH);
+        uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH);

      double bitcost[2] = { 0, 0 };

@ -2091,17 +2073,17 @@ static void search_pu_inter(encoder_state_t * const state,
    }

    // TODO: this probably should have a separate command line option
-    if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]);
+    if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]);
    
    assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE);
    uvg_sort_keys_by_cost(&amvp[2]);
-    if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
-      uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]);
+    if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) {
+      uvg_cu_cost_inter_rd2(state, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc);
    }
  }
  if(cfg->rdo < 2) {
    int predmode_ctx;
-    const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
+    const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx);
    const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0);

    const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
@ -2135,22 +2117,19 @@ static void search_pu_inter(encoder_state_t * const state,
 * \param inter_cost    Return inter cost
 * \param inter_bitcost Return inter bitcost
 */
-void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
-                           int x, int y, int depth,
+void uvg_cu_cost_inter_rd2(
+  encoder_state_t * const state,
  cu_info_t* cur_cu,
  lcu_t *lcu,
  double   *inter_cost,
-                           double* inter_bitcost){
+  double* inter_bitcost,
+  const cu_loc_t* const cu_loc){
  
-  int tr_depth = MAX(1, depth);
-  if (cur_cu->part_size != SIZE_2Nx2N) {
-    tr_depth = depth + 1;
-  }
-  uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T);
+  const int x_px = SUB_SCU(cu_loc->x);
+  const int y_px = SUB_SCU(cu_loc->y);
+  const int width = cu_loc->width;
+  const int height = cu_loc->height;

-  const int x_px = SUB_SCU(x);
-  const int y_px = SUB_SCU(y);
-  const int width = LCU_WIDTH >> depth;
  cabac_data_t cabac_copy;
  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
  cabac_data_t* cabac = &state->search_cabac;
@ -2160,31 +2139,43 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
  *cur_pu = *cur_cu;

  const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-  uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma);
+  uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc);

  int index = y_px * LCU_WIDTH + x_px;
  double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                   LCU_WIDTH, LCU_WIDTH,
-                                   width) * UVG_LUMA_MULT;
+                                   width, height) * UVG_LUMA_MULT;
  if (reconstruct_chroma) {
    int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
    double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                       LCU_WIDTH_C, LCU_WIDTH_C,
-                                       width / 2);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
    double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                       LCU_WIDTH_C, LCU_WIDTH_C,
-                                       width / 2);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
    ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
  }
  double no_cbf_bits;
  double bits = 0;
-  const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL);
-  if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+  const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL);
+
+  int8_t depth = 0;
+  int8_t mtt_depth = 0;
+  uint32_t splits = cur_cu->split_tree;
+  while (splits & 7) {
+    if ((splits & 7) != QT_SPLIT) {
+      mtt_depth++;
+    }
+    depth++;
+    splits >>= 3;
+  }
+  const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth, 0};
+  if (cur_cu->merged) {
    no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;
-    bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T);
+    bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
  }
  else {
-    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T);
+    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
    bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1);
  }
  double no_cbf_cost = ssd + no_cbf_bits * state->lambda;
@ -2194,20 +2185,20 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
    state->encoder_control->cfg.chroma_trskip_enable;

  double chroma_cost = 0;
-  if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && cur_cu->depth == cur_cu->tr_depth && reconstruct_chroma) {
+  if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && PU_IS_TU(cur_cu) && reconstruct_chroma) {
    uvg_quantize_lcu_residual(state,
                              true,
                              false,
-                              false, x, y,
-                              depth,
+                              false,
+                              cu_loc,
                              cur_cu,
                              lcu,
                              false,
                              UVG_BOTH_T);
    ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C];
    ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C];
-    uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width);
-    uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, width, LCU_WIDTH_C, width);
+    uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, height, LCU_WIDTH_C, width);
+    uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, height, LCU_WIDTH_C, width);
    ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C];
    ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C];

@ -2216,6 +2207,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
      u_pred,
      u_resi,
      width,
+      height,
      LCU_WIDTH_C,
      width);
    uvg_generate_residual(
@ -2223,19 +2215,17 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
      v_pred,
      v_resi,
      width,
+      height,
      LCU_WIDTH_C,
      width);

    uvg_chorma_ts_out_t chorma_ts_out;
    uvg_chroma_transform_search(
      state,
-      depth,
      lcu,
      &cabac_copy,
-      width,
-      width,
+      cu_loc,
      index,
-      0,
      cur_cu,
      u_pred,
      v_pred,
@ -2243,41 +2233,41 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
      v_resi,
      &chorma_ts_out,
      UVG_BOTH_T);
-    cbf_clear(&cur_cu->cbf, depth, COLOR_U);
-    cbf_clear(&cur_cu->cbf, depth, COLOR_V);
+    cbf_clear(&cur_cu->cbf, COLOR_U);
+    cbf_clear(&cur_cu->cbf, COLOR_V);
    if (chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) {
      cur_cu->joint_cb_cr = 0;
      cur_cu->tr_skip |= (chorma_ts_out.best_u_index == CHROMA_TS) << COLOR_U;
      cur_cu->tr_skip |= (chorma_ts_out.best_v_index == CHROMA_TS) << COLOR_V;
-      if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_U);
-      if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_V);
+      if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_U);
+      if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_V);
      chroma_cost += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost;
    }
    else {
      cur_cu->joint_cb_cr = chorma_ts_out.best_combined_index;
-      if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, depth, COLOR_U);
-      if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, depth, COLOR_V);
+      if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, COLOR_U);
+      if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, COLOR_V);
      chroma_cost += chorma_ts_out.best_combined_cost;
    }
  }
  else {
    uvg_quantize_lcu_residual(state,
                              true, reconstruct_chroma,
-                              reconstruct_chroma && state->encoder_control->cfg.jccr, x, y,
-                              depth,
+                              reconstruct_chroma && state->encoder_control->cfg.jccr,
+                              cu_loc,
                              cur_cu,
                              lcu,
                              false,
                              UVG_BOTH_T);    
  }

-  int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+  int cbf = cbf_is_set_any(cur_cu->cbf);
  
  if(cbf) {
-    *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu);
+    *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0);
    if (reconstruct_chroma) {
-      if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) {
-        *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
+      if (!PU_IS_TU(cur_cu) || !state->encoder_control->cfg.jccr) {
+        *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc);
      }
      else {
        *inter_cost += chroma_cost;
@ -2297,7 +2287,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,

  if(no_cbf_cost < *inter_cost) {
    cur_cu->cbf = 0;
-    if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+    if (cur_cu->merged) {
      cur_cu->skipped = 1;
    }
    *inter_cost = no_cbf_cost;
@ -2321,8 +2311,9 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
 * \param inter_cost    Return inter cost
 * \param inter_bitcost Return inter bitcost
 */
-void uvg_search_cu_inter(encoder_state_t * const state,
-                         int x, int y, int depth,
+void uvg_search_cu_inter(
+  encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  double   *inter_cost,
  double* inter_bitcost)
@ -2338,12 +2329,8 @@ void uvg_search_cu_inter(encoder_state_t * const state,
  inter_search_info_t info;

  search_pu_inter(state,
-                  x, y, depth,
-                  SIZE_2Nx2N, 0,
-                  lcu,
-                  amvp,
-                  &merge,
-                  &info);
+                  cu_loc, lcu, amvp,
+                  &merge, &info);

  // Early Skip CU decision
  if (merge.size == 1 && merge.unit[0].skipped) {
@ -2385,13 +2372,14 @@ void uvg_search_cu_inter(encoder_state_t * const state,
    return;
  }

-  const int x_local = SUB_SCU(x);
-  const int y_local = SUB_SCU(y);
+  const int x_local = SUB_SCU(cu_loc->x);
+  const int y_local = SUB_SCU(cu_loc->y);
  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
  *cur_pu = *best_inter_pu;

-  uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
-    true, state->encoder_control->chroma_format != UVG_CSP_400);   
+  uvg_inter_recon_cu(state, lcu,
+                     true, state->encoder_control->chroma_format != UVG_CSP_400,
+                     cu_loc);   

  if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) {
    assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));
--- a/src/search_inter.h
+++ b/src/search_inter.h
@ -73,8 +73,9 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state,
                                  int32_t ref_idx,
                                  double *bitcost);

-void uvg_search_cu_inter(encoder_state_t * const state,
-                         int x, int y, int depth,
+void uvg_search_cu_inter(
+  encoder_state_t * const state,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  double *inter_cost,
  double* inter_bitcost);
@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state,
                             const lcu_t *lcu,
                             int x,
                             int y);
-void uvg_cu_cost_inter_rd2(encoder_state_t* const state,
-  int x, int y, int depth,
+void uvg_cu_cost_inter_rd2(
+  encoder_state_t* const state,
  cu_info_t* cur_cu,
  lcu_t* lcu,
  double* inter_cost,
-  double* inter_bitcost);
+  double* inter_bitcost,
+  const cu_loc_t* const cu_loc);

 int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);

--- a/src/search_intra.c
+++ b/src/search_intra.c
--- a/src/search_intra.h
+++ b/src/search_intra.h
@ -43,27 +43,27 @@
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"

-double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu);
+double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t*
+                          const cu_loc,
+                          const lcu_t* lcu);
                       
 double uvg_chroma_mode_bits(const encoder_state_t *state,
                        int8_t chroma_mode, int8_t luma_mode);

 int8_t uvg_search_cu_intra_chroma(
  encoder_state_t * const state,
-  const int x_px,
-  const int y_px,
-  const int depth,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  intra_search_data_t* best_cclm,
-  enum uvg_tree_type tree_type);
+  int8_t luma_mode,
+  enum uvg_tree_type tree_type,
+  bool is_separate);

 void uvg_search_cu_intra(
  encoder_state_t * const state,
-  const int x_px,
-  const int y_px,
-  const int depth,
  intra_search_data_t* search_data,
  lcu_t *lcu,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  const cu_loc_t* const cu_loc);

 #endif // SEARCH_INTRA_H_
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
--- a/src/strategies/avx2/dct_avx2_tables.h
+++ b/src/strategies/avx2/dct_avx2_tables.h
--- a/src/strategies/avx2/depquant-avx2.c
+++ b/src/strategies/avx2/depquant-avx2.c
--- a/src/strategies/avx2/depquant-avx2.h
+++ b/src/strategies/avx2/depquant-avx2.h
@ -0,0 +1,46 @@
+#ifndef STRATEGIES_DEPQUANT_AVX2_H_
+#define STRATEGIES_DEPQUANT_AVX2_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_AVX2_H_
--- a/src/strategies/avx2/encode_coding_tree-avx2.h
+++ b/src/strategies/avx2/encode_coding_tree-avx2.h
@ -38,13 +38,14 @@
 * Functions for writing the coding quadtree and related syntax.
 */

+#include "cu.h"
 #include "encoderstate.h"
 #include "global.h"

 void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
                               cabac_data_t * const cabac,
                               const coeff_t *coeff,
-                               uint8_t width,
+                               const cu_loc_t *loc,
                               uint8_t type,
                               int8_t scan_mode,
                               int8_t tr_skip,
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@ -42,10 +42,9 @@
 #include "strategyselector.h"
 #include "strategies/missing-intel-intrinsics.h"

-
 /**
 * \brief Generate angular predictions.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
 * \param intra_mode    Angular mode in range 2..34.
 * \param channel_type  Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
@ -54,20 +53,28 @@
 * \param multi_ref_idx Reference line index for use with MRL.
 */
 static void uvg_angular_pred_avx2(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
  const uvg_pixel *const in_ref_left,
  uvg_pixel *const dst,
-  const uint8_t multi_ref_idx)
+  const uint8_t multi_ref_idx,
+  const uint8_t isp_mode,
+  const int cu_dim)
 {
+  // ISP_TODO: non-square block implementation, height is passed but not used
+  const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width =  uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];

-  assert(log2_width >= 2 && log2_width <= 5);
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
  assert(intra_mode >= 2 && intra_mode <= 66);

  // TODO: implement handling of MRL
  uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0;
+  uint8_t isp = isp_mode;

  __m256i p_shuf_01 = _mm256_setr_epi8(
    0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04,
@ -142,7 +149,6 @@ static void uvg_angular_pred_avx2(
  //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE:IDX] = { 0 };
  uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
  uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  const int_fast32_t width = 1 << log2_width;

  int32_t pred_mode = intra_mode; // ToDo: handle WAIP

@ -345,13 +351,13 @@ static void uvg_angular_pred_avx2(

     
      // PDPC
-      bool PDPC_filter = (width >= 4 || channel_type != 0);
+      bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0);
      if (pred_mode > 1 && pred_mode < 67) {
        if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
          PDPC_filter = false;
        }
        else if (mode_disp > 0) {
-          PDPC_filter = (scale >= 0);
+          PDPC_filter &= (scale >= 0);
        }
      }
      if(PDPC_filter) {
@ -497,20 +503,27 @@ static void uvg_angular_pred_avx2(

 /**
 * \brief Generate planar prediction.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
+ * \param color         Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
 * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
 * \param dst           Buffer of size width*width.
 */
 static void uvg_intra_pred_planar_avx2(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uint8_t *const ref_top,
  const uint8_t *const ref_left,
  uint8_t *const dst)
 {
-  assert(log2_width >= 2 && log2_width <= 5);
+  // ISP_TODO: non-square block implementation, height is passed but not used
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width =  uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];
+
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));

-  const int_fast8_t width = 1 << log2_width;
  const uint8_t top_right = ref_top[width + 1];
  const uint8_t bottom_left = ref_left[width + 1];

@ -964,12 +977,17 @@ static void uvg_intra_pred_filtered_dc_avx2(
 */
 static void uvg_pdpc_planar_dc_avx2(
  const int mode,
-  const int width,
-  const int log2_width,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
  const uvg_intra_ref *const used_ref,
  uvg_pixel *const dst)
 {
+  // ISP_TODO: non-square block implementation, height is passed but not used
  assert(mode == 0 || mode == 1);  // planar or DC
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width =  uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];

  __m256i shuf_mask_byte = _mm256_setr_epi8(
    0, -1, 0, -1, 0, -1, 0, -1,
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)

 static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
                 const int ref_stride, const int rec_stride,
-                 const int width)
+                 const int width, const int height)
 {
+  assert(width == height && "Non square not yet implemented");
  __m256i ssd_part;
  __m256i diff = _mm256_setzero_si256();
  __m128i sum;
@ -1743,40 +1744,32 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
  return diff;
 }

-static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
-
+static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) {
+  // Non-square blocks: every path loops over height rows (4-wide path in steps of 4 rows, 8-wide path in steps of 2).
  __m128i diff = _mm_setzero_si128();
  switch (width) {
  case 4:
-    diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride);
-    _mm_storel_epi64((__m128i*) & (residual[0]), diff);
-    diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride);
-    _mm_storel_epi64((__m128i*) & (residual[4]), diff);
-    diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride);
-    _mm_storel_epi64((__m128i*) & (residual[8]), diff);
-    diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride);
-    _mm_storel_epi64((__m128i*) & (residual[12]), diff);
+    for (int y = 0; y < height; y+=4) {
+      diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride);
+      _mm_storel_epi64((__m128i*) & (residual[y * 4]), diff);
+      diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride);
+      _mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff);
+      diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, pred_in + (y + 2) * pred_stride);
+      _mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff);
+      diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride);
+      _mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff);
+    }
    break;
  case 8:
-    diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[0]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[8]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[16]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[24]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[32]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[40]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[48]), diff);
-    diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]);
-    _mm_storeu_si128((__m128i*) & (residual[56]), diff);
+    for (int y = 0; y < height; y += 2) {
+      diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]);
+      _mm_storeu_si128((__m128i*) & (residual[y * 8]), diff);
+      diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]);
+      _mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff);
+    }
    break;
  default:
-    for (int y = 0; y < width; ++y) {
+    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; x += 16) {
        diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]);
        _mm_storeu_si128((__m128i*) & residual[x + y * width], diff);
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -380,20 +380,24 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx)
 {
  const encoder_control_t * const encoder = state->encoder_control;
-  const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
-  const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1];
+  const uint32_t log2_tr_width  = uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height);

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
-  uint32_t log2_tr_width = uvg_math_floor_log2(height);
-  uint32_t log2_tr_height = uvg_math_floor_log2(width);
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size
+  
  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
  const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
-  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
+  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
  const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

+  const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
+
  uint32_t ac_sum = 0;
  int32_t last_cg = -1;

@ -402,7 +406,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
  // Loading once is enough if scaling lists are not off
  __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256();
  if (!(state->encoder_control->scaling_list.enable)) {
-    low_b  = _mm256_set1_epi32(quant_coeff[0]);
+    low_b  = _mm256_set1_epi32(default_quant_coeff);
    high_b = low_b;
  }

@ -579,8 +583,9 @@ static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint
  return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec));
 }

-static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){
+static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width, int height){

+  if (height == width || width >= 16) {
    switch (width) {
    case 4:
      *(int32_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride);
@ -599,7 +604,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
      *(int64_t*)& (rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride);
      break;
    default:
-      for (int y = 0; y < width; ++y) {
+      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 16) {
          *(int64_t*)& (rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride);
          *(int64_t*)& (rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride);
@ -608,6 +613,32 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
      break;
    }
  }
+  else {
+    switch (width) {
+    case 4:
+      for (int y = 0; y < height; y += 4) {
+        *(int32_t*)& (rec_out[(y + 0) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 0) * width, pred_in + (y + 0) * in_stride);
+        *(int32_t*)& (rec_out[(y + 1) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 1) * width, pred_in + (y + 1) * in_stride);
+        *(int32_t*)& (rec_out[(y + 2) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 2) * width, pred_in + (y + 2) * in_stride);
+        *(int32_t*)& (rec_out[(y + 3) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 3) * width, pred_in + (y + 3) * in_stride);
+      }
+      break;
+    case 8:
+      for (int y = 0; y < height; ++y) {
+        *(int64_t*)& (rec_out[y * out_stride]) = get_quantized_recon_8x1_avx2(residual + y * width, pred_in + y * in_stride); // 8 pixels per row: store all 64 bits
+      }
+      break;
+    default:
+      for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+          int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
+          rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val);
+        }
+      }
+      break;
+    }
+  }
+}

 /**
 * \brief Quantize residual and get both the reconstruction and coeffs.
@ -626,7 +657,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
 * \returns  Whether coeff_out contains any non-zero coefficients.
 */
 int uvg_quantize_residual_avx2(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
  const coeff_scan_order_t scan_order, const int use_trskip,
  const int in_stride, const int out_stride,
  const uint8_t *const ref_in, const uint8_t *const pred_in,
@ -637,15 +668,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
  // Temporary arrays to pass data to and from uvg_quant and transform functions.
  ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
  ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  // ISP_TODO: non-square block implementation, height is passed but not used
  
-  const int height = width; // TODO: height for non-square blocks
  int has_coeffs = 0;

  assert(width <= TR_MAX_WIDTH);
  assert(width >= TR_MIN_WIDTH);

  // Get residual. (ref_in - pred_in -> residual)
-  uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
+  uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);

  if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
    int y, x;
@ -662,40 +693,51 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,

  // Transform residual. (residual -> coeff)
  if (use_trskip) {
-    uvg_transformskip(state->encoder_control, residual, coeff, width);
+    uvg_transformskip(state->encoder_control, residual, coeff, width, height);
  }
  else {
-    uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+    uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
  }

  const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;

  if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
    // Forward low frequency non-separable transform
-    uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+    uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
  }

  // Quantize coeffs. (coeff -> coeff_out)
-  
-  if (state->encoder_control->cfg.rdoq_enable &&
+  int abs_sum = 0;
+  if(!use_trskip && state->encoder_control->cfg.dep_quant) {
+    uvg_dep_quant(
+      state,
+      cur_cu,
+      width,
+      height,
+      coeff,
+      coeff_out,
+      color,
+      tree_type,
+      &abs_sum,
+      state->encoder_control->cfg.scaling_list);
+  }
+  else if (state->encoder_control->cfg.rdoq_enable &&
      (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip)
  {
-    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
-    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    uvg_rdoq(state, coeff, coeff_out, width, width, color,
-      scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index);
+    uvg_rdoq(state, coeff, coeff_out, width, height, color,
+      scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0);
  }
  else if (state->encoder_control->cfg.rdoq_enable && use_trskip) {
-    uvg_ts_rdoq(state, coeff, coeff_out, width, width, color,
+    uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
      scan_order);
  }
  else {
-    uvg_quant(state, coeff, coeff_out, width, width, color,
+    uvg_quant(state, coeff, coeff_out, width, height, color,
      scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index);
  }

  // Check if there are any non-zero coefficients.
-  for (int i = 0; i < width * width; i += 8) {
+  for (int i = 0; i < width * height; i += 8) {
    __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i]));
    has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff);
    if(has_coeffs) break;
@ -705,25 +747,25 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
  // rec_out.
  if (has_coeffs && !early_skip) {
    // Get quantized residual. (coeff_out -> coeff -> residual)
-    uvg_dequant(state, coeff_out, coeff, width, width, color,
+    uvg_dequant(state, coeff_out, coeff, width, height, color,
      cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);

    if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
      // Inverse low frequency non-separable transform
-      uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+      uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
    }
    if (use_trskip) {
-      uvg_itransformskip(state->encoder_control, residual, coeff, width);
+      uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
    }
    else {
-      uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+      uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
    }

    if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
      int y, x;
      int sign, absval;
      int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
-      for (y = 0; y < width; ++y) {
+      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) {
          residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
          sign = residual[x + y * width] >= 0 ? 1 : -1;
@ -739,14 +781,14 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
    }

    // Get quantized reconstruction. (residual + pred_in -> rec_out)
-    get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width);
+    get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width, height);
  }
  else if (rec_out != pred_in) {
    // With no coeffs and rec_out == pred_int we skip copying the coefficients
    // because the reconstruction is just the prediction.
    int y, x;

-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
      }
@ -763,20 +805,26 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
 void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
  const encoder_control_t * const encoder = state->encoder_control;
+  if (encoder->cfg.dep_quant && !transform_skip) {
+    uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
+    return;
+  }
  int32_t shift,add,coeff_q;
  int32_t n;
-  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
+  const uint32_t log2_tr_width =  uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1);
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size// Represents scaling through forward transform


  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;

-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);

  if (encoder->scaling_list.enable)
  {
-    uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
-    uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);

    const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6];
@ -797,7 +845,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
      }
    }
  } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
    add = 1 << (shift-1);

    __m256i v_scale = _mm256_set1_epi32(scale);
@ -845,8 +893,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
  return parts[0] + parts[1] + parts[2] + parts[3];
 }

-static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
  const __m256i zero           = _mm256_setzero_si256();
  const __m256i threes         = _mm256_set1_epi16(3);
  const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
@ -863,7 +912,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64
  __m256i wts_lo     = _mm256_broadcastsi128_si256(wts_lo_128);
  __m256i wts_hi     = _mm256_broadcastsi128_si256(wts_hi_128);

-  for (int i = 0; i < width * width; i += 32) {
+  for (int i = 0; i < width * height; i += 32) {
    __m256i curr_lo      = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
    __m256i curr_abs_lo  = _mm256_abs_epi16   (curr_lo);
    __m256i curr_max3_lo = _mm256_min_epu16   (curr_abs_lo, threes);
--- a/src/strategies/generic/dct-generic.c
+++ b/src/strategies/generic/dct-generic.c
@ -771,6 +771,12 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input,


 // DCT-2
+// 2-point DCT-2 basis as a row-major 2x2 initializer: row 0 is {a, a}, row 1 is {a, -a}.
+#define DEFINE_DCT2_P2_MATRIX(a) \
+{ \
+   a,  a, \
+   a, -a  \
+}
+
 #define DEFINE_DCT2_P4_MATRIX(a,b,c) \
 { \
   a,  a,  a,  a, \
@ -1002,6 +1008,7 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input,
 }

 // DCT-2
+const int16_t uvg_g_DCT2P2[4] = DEFINE_DCT2_P2_MATRIX(64);
 const int16_t uvg_g_DCT2P4[16] = DEFINE_DCT2_P4_MATRIX(64, 83, 36);
 const int16_t uvg_g_DCT2P8[64] = DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18);
 const int16_t uvg_g_DCT2P16[256] = DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9);
@ -1020,6 +1027,68 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77
 const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4);

 // ********************************** DCT-2 **********************************
+/**
+ * \brief Forward 2-point DCT-2 applied to the rows of a block.
+ *
+ * Every processed row consumes two consecutive input samples and produces one
+ * sum term and one difference term. The output is written transposed: sum
+ * terms go to dst[0 .. line-1], difference terms to dst[line .. 2*line-1].
+ *
+ * \param src         Input samples, two per processed row.
+ * \param dst         Output coefficients (2 * line entries).
+ * \param shift       Right shift applied after adding the rounding offset.
+ * \param line        Number of rows in the block.
+ * \param skip_line   Number of trailing rows that are not transformed; their
+ *                    output positions are set to zero.
+ * \param skip_line2  Not used by the 2-point transform.
+ */
+static void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int16_t* const basis = uvg_g_DCT2P2;
+  const int32_t rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
+  const int active_lines = line - skip_line;
+
+  for (int32_t row = 0; row < active_lines; row++) {
+    const int32_t sum  = src[2 * row] + src[2 * row + 1];
+    const int32_t diff = src[2 * row] - src[2 * row + 1];
+
+    dst[row]        = (basis[0] * sum + rounding) >> shift;
+    dst[row + line] = (basis[2] * diff + rounding) >> shift;
+  }
+
+  if (skip_line) {
+    // Zero the skipped tail of both output rows.
+    for (int32_t out_row = 0; out_row < 2; out_row++) {
+      memset(dst + active_lines + out_row * line, 0, sizeof(int16_t) * skip_line);
+    }
+  }
+}
+
+/**
+ * \brief Inverse 2-point DCT-2 applied to the columns of a coefficient block.
+ *
+ * For each processed column the two coefficients src[col] and src[col + line]
+ * are combined into a sum and a difference term, rounded, shifted, clipped to
+ * the int16_t range and written as two consecutive output samples.
+ *
+ * \param src         Input coefficients (read at col and col + line).
+ * \param dst         Output samples, two per processed column.
+ * \param shift       Right shift applied after adding the rounding offset.
+ * \param line        Number of columns in the block.
+ * \param skip_line   Number of trailing columns that are not transformed;
+ *                    their 2 * skip_line output samples are set to zero.
+ * \param skip_line2  Not used by the 2-point transform.
+ */
+static void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2)
+{
+  const int16_t* const basis = uvg_g_DCT2P2;
+  const int32_t rounding = 1 << (shift - 1);
+  const int active_lines = line - skip_line;
+  int16_t* out = dst;
+
+  for (int32_t col = 0; col < active_lines; col++, out += 2) {
+    const int32_t even = basis[0] * (src[col] + src[col + line]);
+    const int32_t odd  = basis[2] * (src[col] - src[col + line]);
+
+    out[0] = (short)CLIP(-32768, 32767, (even + rounding) >> shift);
+    out[1] = (short)CLIP(-32768, 32767, (odd + rounding) >> shift);
+  }
+
+  if (skip_line) {
+    // Outputs belonging to the skipped columns follow the transformed ones.
+    memset(out, 0, (skip_line << 1) * sizeof(int16_t));
+  }
+}
+
 static void fastForwardDCT2_B4(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
 {
  int32_t j;
@ -1366,11 +1435,6 @@ static void fastForwardDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift,
      dst += line;
    }
  }
-  if (skip_line2) {
-    const int  reduced_line = line - skip_line2;
-    dst = p_coef + reduced_line * 32;
-    memset(dst, 0, skip_line2 * 32 * sizeof(coeff_t));
-  }
 }

 static void fastInverseDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
@ -2417,16 +2481,16 @@ DCT_MTS_NXN_GENERIC(DST1, 32);
 typedef void partial_tr_func(const int16_t*, int16_t*, int32_t, int, int, int);

 // ToDo: Enable MTS 2x2 and 64x64 transforms
-static partial_tr_func* dct_table[3][5] = {
-  { fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL },
-  { fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL },
-  { fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL },
+static partial_tr_func* dct_table[3][6] = {
+  { fastForwardDCT2_B2, fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL },
+  { NULL,               fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL },
+  { NULL,               fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL },
 };

-static partial_tr_func* idct_table[3][5] = {
-  { fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ },
-  { fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL },
-  { fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL },
+static partial_tr_func* idct_table[3][6] = {
+  { fastInverseDCT2_B2, fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ },
+  { NULL,               fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL },
+  { NULL,               fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL },
 };


@ -2436,11 +2500,12 @@ static const tr_type_t mts_subset_intra[4][2] = { { DST7, DST7 }, { DCT8, DST7 }

 void uvg_get_tr_type(
  int8_t width,
+  int8_t height,
  color_t color,
  const cu_info_t* tu,
  tr_type_t* hor_out,
  tr_type_t* ver_out,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
  *hor_out = DCT2;
  *ver_out = DCT2;
@ -2450,13 +2515,19 @@ void uvg_get_tr_type(
    return;
  }

-  const int height = width;
-  const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER));
-  const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER);
+  const bool explicit_mts = mts_type == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : (mts_type == UVG_MTS_INTER && tu->type == CU_INTER));
+  const bool implicit_mts = tu->type == CU_INTRA && (mts_type == UVG_MTS_IMPLICIT || mts_type == UVG_MTS_INTER);

  assert(!(explicit_mts && implicit_mts));
+  const bool is_isp = tu->type == CU_INTRA && tu->intra.isp_mode && color == COLOR_Y ? tu->intra.isp_mode : 0;
+  const int8_t lfnst_idx = color == COLOR_Y ? tu->lfnst_idx : tu->cr_lfnst_idx;
+  // const bool is_sbt = cu->type == CU_INTER && tu->sbt && color == COLOR_Y; // TODO: check SBT here when implemented

-  if (implicit_mts)
+  if (is_isp && lfnst_idx) {
+    return;
+  }
+
+  if (implicit_mts || (is_isp && explicit_mts))
  {
    bool width_ok = width >= 4 && width <= 16;
    bool height_ok = height >= 4 && height <= 16;
@ -2472,6 +2543,10 @@ void uvg_get_tr_type(
    return;
  }

+  /*
+  TODO: SBT HANDLING
+  */
+
  if (explicit_mts)
  {
    if (tu->tr_idx > MTS_SKIP) {
@ -2487,27 +2562,31 @@ static void mts_dct_generic(
  const color_t color,
  const cu_info_t* tu,
  const int8_t width,
+  const int8_t height,
  const int16_t* input,
  int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
  tr_type_t type_hor;
  tr_type_t type_ver;

-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);

-  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx)
+  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height)
  {
-    dct_func *dct_func = uvg_get_dct_func(width, color, tu->type);
+    dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type);
    dct_func(bitdepth, input, output);
  }
  else
  {
-    const int height = width;
    int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0);
    int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0);
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
-    if(tu->lfnst_idx || tu->cr_lfnst_idx) {
+    const int log2_width_minus1  = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+    //const int log2_width_minus2 = uvg_g_convert_to_bit[width];
+    //const int log2_height_minus2 = uvg_g_convert_to_bit[height];
+
+    if((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) {
      if ((width == 4 && height > 4) || (width > 4 && height == 4))
      {
        skip_width = width - 4;
@ -2520,17 +2599,22 @@ static void mts_dct_generic(
      }
    }

-    partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2];
-    partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2];
+    partial_tr_func* dct_hor = width != 1 ? dct_table[type_hor][log2_width_minus1] : NULL;
+    partial_tr_func* dct_ver = height != 1 ? dct_table[type_ver][log2_height_minus1] : NULL;

    int16_t tmp[32 * 32];
-    const int32_t shift_1st = log2_width_minus2 + bitdepth - 7;
-    const int32_t shift_2nd = log2_width_minus2 + 8;
-
+    const int32_t shift_1st = log2_width_minus1 + bitdepth - 8;
+    const int32_t shift_2nd = log2_height_minus1 + 7;
+    if (height == 1) {
+      dct_hor(input, output, shift_1st, height, 0, skip_width);
+    } else if (width == 1) {
+      dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height);
+    } else {
      dct_hor(input, tmp, shift_1st, height, 0, skip_width);
      dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height);
    }    
  }
+}


 static void mts_idct_generic(
@ -2538,38 +2622,59 @@ static void mts_idct_generic(
  const color_t color,
  const cu_info_t* tu,
  const int8_t width,
+  const int8_t height,
  const int16_t* input,
  int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
  tr_type_t type_hor;
  tr_type_t type_ver;

-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);

-  if (type_hor == DCT2 && type_ver == DCT2)
+  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height)
  {
-    dct_func *idct_func = uvg_get_idct_func(width, color, tu->type);
+    dct_func *idct_func = uvg_get_idct_func(width, height, color, tu->type);
    idct_func(bitdepth, input, output);
  }
  else
  {
-    const int height = width;
-    const int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0;
-    const int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0;
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
+    int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0;
+    int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0;
+    const int log2_width_minus1  = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;

-    partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2];
-    partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2];
+    if ((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) {
+      if ((width == 4 && height > 4) || (width > 4 && height == 4)) {
+        skip_width = width - 4;
+        skip_height = height - 4;
+      }
+      else if ((width >= 8 && height >= 8)) {
+        skip_width = width - 8;
+        skip_height = height - 8;
+      }
+    }
+
+    partial_tr_func* idct_hor = width != 1 ? idct_table[type_hor][log2_width_minus1] : NULL;
+    partial_tr_func* idct_ver = height != 1 ? idct_table[type_ver][log2_height_minus1] : NULL;

    int16_t tmp[32 * 32];
-    const int32_t shift_1st = 7;
-    const int32_t shift_2nd = 20 - bitdepth;
+    const int max_log2_tr_dynamic_range = 15;
+    const int transform_matrix_shift = 6;

+    const int32_t shift_1st = transform_matrix_shift + 1;
+    const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth;
+
+    if (height == 1) {
+      idct_hor(input, output, shift_2nd + 1, height, 0, skip_width);
+    } else if (width == 1) {
+      idct_ver(input, output, shift_2nd + 1, width, 0, skip_height);
+    } else {
      idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
      idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
    }
  }
+}


 int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
@ -2582,6 +2687,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
  success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
  success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
  success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
+  //success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);

  success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);

--- a/src/strategies/generic/depquant-generic.c
+++ b/src/strategies/generic/depquant-generic.c
@ -0,0 +1,252 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#include "strategies/generic/depquant-generic.h"
+
+#include "dep_quant.h"
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "strategyselector.h"
+#include "transform.h"
+#include "uvg_math.h"
+#include "generic/quant-generic.h"
+// Cost table added to the RD cost in checkRdCostStart() for absolute levels of
+// 4 or more. The first index is the state's m_goRicePar, the second is
+// (absLevel - 4) >> 1 clamped to RICEMAX - 1.
+// NOTE(review): every entry is a multiple of 32768 (1 << 15), so these look
+// like bit counts in 15-bit fixed point -- confirm against the rate estimator.
+static const int32_t g_goRiceBits[4][RICEMAX] = {
+  {32768,  65536,  98304,  131072, 163840, 196608, 262144, 262144,
+   327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
+   393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
+   458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+  {65536,  65536,  98304,  98304,  131072, 131072, 163840, 163840,
+   196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
+   360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
+   425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+  {98304,  98304,  98304,  98304,  131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
+   327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+  {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
+   196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+};
+
+
+/**
+ * \brief Unconditionally set a decision slot to the "skip sub-block" candidate.
+ *
+ * The cost is the RD cost of state (decision_id + skip_offset) plus entry 0 of
+ * that state's m_sbbFracBits. The level is set to zero and prevId to 4 plus
+ * the state's m_stateId.
+ *
+ * \param decision     Decision set to update.
+ * \param state        All dependent quantization states.
+ * \param decision_id  Slot to write.
+ * \param skip_offset  Offset of the skip states inside the state arrays.
+ */
+static INLINE void checkRdCostSkipSbbZeroOut(
+  Decision* decision, 
+  const all_depquant_states* const state,
+  int decision_id, 
+  int skip_offset) {
+  const int src_state = decision_id + skip_offset;
+  const int64_t skip_cost = state->m_rdCost[src_state] + state->m_sbbFracBits[src_state][0];
+
+  decision->rdCost[decision_id]   = skip_cost;
+  decision->absLevel[decision_id] = 0;
+  decision->prevId[decision_id]   = 4 + state->m_stateId[src_state];
+}
+
+/**
+ * \brief Replace a decision slot with the "skip sub-block" candidate if it is cheaper.
+ *
+ * The candidate cost is the RD cost of state (skip_offset + decision_id) plus
+ * entry 0 of that state's m_sbbFracBits. The slot is only overwritten when the
+ * candidate cost is strictly lower than the cost already stored.
+ *
+ * \param state        All dependent quantization states.
+ * \param decisions    Decision set to update.
+ * \param decision_id  Slot to compare and possibly overwrite.
+ * \param skip_offset  Offset of the skip states inside the state arrays.
+ */
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  const int src_state = skip_offset + decision_id;
+  const int64_t skip_cost = state->m_rdCost[src_state] + state->m_sbbFracBits[src_state][0];
+
+  if (skip_cost >= decisions->rdCost[decision_id]) {
+    return;
+  }
+
+  decisions->rdCost[decision_id]   = skip_cost;
+  decisions->absLevel[decision_id] = 0;
+  decisions->prevId[decision_id]   = 4 + state->m_stateId[src_state];
+}
+
+/**
+ * \brief Evaluate a candidate coded from the start state.
+ *
+ * The cost is the candidate's delta distortion plus lastOffset plus the level
+ * cost taken from the start state's m_coeffFracBits. Levels of 4 or more
+ * additionally pay a g_goRiceBits entry selected by the state's m_goRicePar.
+ * The slot is overwritten (with prevId -1) only if the cost is strictly lower.
+ *
+ * \param state        Start state.
+ * \param lastOffset   Cost added for every candidate evaluated here.
+ * \param pqData       Pre-quantized candidates.
+ * \param decisions    Decision set to update.
+ * \param decision_id  Candidate / slot index.
+ */
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int decision_id)
+{
+  int64_t cost = pqData->deltaDist[decision_id] + lastOffset;
+
+  if (pqData->absLevel[decision_id] >= 4) {
+    // Large levels: part of the level is charged through the Rice table.
+    const coeff_t rice_value = (pqData->absLevel[decision_id] - 4) >> 1;
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (rice_value << 1)]
+            + g_goRiceBits[state->m_goRicePar][rice_value < RICEMAX ? rice_value : RICEMAX - 1];
+  }
+  else {
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+
+  if (cost >= decisions->rdCost[decision_id]) {
+    return;
+  }
+
+  decisions->rdCost[decision_id]   = cost;
+  decisions->absLevel[decision_id] = pqData->absLevel[decision_id];
+  decisions->prevId[decision_id]   = -1;
+}
+
+
+
+// Template copied into a Decision by xDecide() before any candidate is
+// evaluated. Every rdCost slot starts at INT64_MAX >> 2, so any candidate
+// compared with '<' against it replaces it. Slots 0-3 start with absLevel -1
+// and prevId -2; slots 4-7 start with absLevel 0 and prevId 4..7.
+static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2},
+  .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} };
+
+/**
+ * \brief Compute the four quantization candidates for one coefficient.
+ *
+ * Starting from the quantization index derived from absCoeff * quanCoeff
+ * (clamped to [1, m_maxQIdx]), four consecutive indices are evaluated. Each
+ * one is stored in slot (index & 3) of pqData with its delta distortion and
+ * its absolute level (index + 1) >> 1.
+ *
+ * \param qp         Quantization parameters for the block.
+ * \param absCoeff   Absolute value of the transform coefficient.
+ * \param pqData     Output: delta distortion and level for all four slots.
+ * \param quanCoeff  Quantization scale used for this coefficient.
+ */
+static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
+{
+  const int64_t scaled_coeff = (int64_t)(absCoeff) * quanCoeff;
+  coeff_t q_idx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaled_coeff + qp->m_QAdd) >> qp->m_QShift)));
+  int64_t dist_term = q_idx * qp->m_DistStepAdd - scaled_coeff * qp->m_DistOrgFact;
+
+  for (int candidate = 0; candidate < 4; ++candidate) {
+    if (candidate > 0) {
+      // Moving to the next index adds one more distortion step.
+      dist_term += qp->m_DistStepAdd;
+    }
+    const int slot = q_idx & 3;
+    pqData->deltaDist[slot] = (dist_term * q_idx + qp->m_DistAdd) >> qp->m_DistShift;
+    pqData->absLevel[slot]  = (++q_idx) >> 1;
+  }
+}
+
+/**
+ * \brief Fill the decision set for one scan position.
+ *
+ * The decisions are first reset to startDec. For a zeroed-out position only
+ * the sub-block skip candidates are written, and only when spt is SCAN_EOCSBB.
+ * Otherwise the coefficient is pre-quantized, the four RD checks are run
+ * against the previous states, the skip candidates are compared in when spt
+ * is SCAN_EOCSBB, and finally slots 0 and 2 are compared against the start
+ * state.
+ *
+ * \param all_states    All dependent quantization states.
+ * \param m_startState  Start state used by checkRdCostStart().
+ * \param qp            Quantization parameters for the block.
+ * \param spt           Scan position type of the current position.
+ * \param absCoeff      Absolute value of the coefficient.
+ * \param lastOffset    Cost added to candidates evaluated from the start state.
+ * \param decisions     Output decision set.
+ * \param zeroOut       True if the position is forced to zero.
+ * \param quanCoeff     Quantization scale used for this coefficient.
+ * \param skip_offset   Offset of the skip states inside the state arrays.
+ * \param prev_offset   Offset of the previous states inside the state arrays.
+ */
+static void xDecide(
+  all_depquant_states* const all_states,
+  depquant_state* const      m_startState,
+  quant_block*               qp,
+  const enum ScanPosType     spt,
+  const coeff_t              absCoeff,
+  const int                  lastOffset,
+  Decision*                  decisions,
+  bool                       zeroOut,
+  coeff_t                    quanCoeff,
+  const int                  skip_offset,
+  const int                  prev_offset)
+{
+  // (decision slot A, decision slot B) used with previous state prev_offset + k.
+  static const int rd_check_slots[4][2] = { {0, 2}, {2, 0}, {1, 3}, {3, 1} };
+  const bool end_of_sbb = (spt == SCAN_EOCSBB);
+
+  memcpy(decisions, &startDec, sizeof(Decision));
+
+  if (zeroOut) {
+    if (end_of_sbb) {
+      for (int id = 0; id < 4; ++id) {
+        checkRdCostSkipSbbZeroOut(decisions, all_states, id, skip_offset);
+      }
+    }
+    return;
+  }
+
+  PQData candidates;
+  preQuantCoeff(qp, absCoeff, &candidates, quanCoeff);
+
+  for (int k = 0; k < 4; ++k) {
+    uvg_dep_quant_check_rd_costs(all_states, spt, &candidates, decisions, rd_check_slots[k][0], rd_check_slots[k][1], prev_offset + k);
+  }
+
+  if (end_of_sbb) {
+    for (int id = 0; id < 4; ++id) {
+      checkRdCostSkipSbb(all_states, decisions, id, skip_offset);
+    }
+  }
+
+  checkRdCostStart(m_startState, lastOffset, &candidates, decisions, 0);
+  checkRdCostStart(m_startState, lastOffset, &candidates, decisions, 2);
+}
+
+
+/**
+ * \brief Run the decision step for one scan position and update the states.
+ *
+ * Swaps the current and previous state offsets, classifies the scan position,
+ * calls xDecide() to fill the trellis entry for scan_pos and then, for every
+ * position except scan position 0, updates the four states: with the
+ * end-of-sub-block update when scan_pos is a multiple of 16, otherwise with
+ * the regular update unless the position is zeroed out.
+ *
+ * \param re                Rate estimator (provides the last-position bits).
+ * \param ctxs              Dependent quantization context store.
+ * \param scan_info         Scan information for the current position.
+ * \param absCoeff          Absolute value of the coefficient.
+ * \param scan_pos          Current scan position.
+ * \param width_in_sbb      Block width in sub-blocks.
+ * \param height_in_sbb     Block height in sub-blocks.
+ * \param next_nb_info_ssb  Neighbour information passed to the state update.
+ * \param zeroOut           True if the position is forced to zero.
+ * \param quantCoeff        Quantization scale used for this coefficient.
+ * \param effWidth          Effective block width.
+ * \param effHeight         Effective block height.
+ * \param is_chroma         Selects the chroma entry of the context offsets.
+ */
+static void uvg_dep_quant_decide_and_update_generic(
+  rate_estimator_t*                         re,
+  context_store*                          ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t                           absCoeff,
+  const uint32_t                          scan_pos,
+  const uint32_t                          width_in_sbb,
+  const uint32_t                          height_in_sbb,
+  const NbInfoSbb                         next_nb_info_ssb,
+  bool                                    zeroOut,
+  coeff_t                                 quantCoeff,
+  const uint32_t                          effWidth,
+  const uint32_t                          effHeight,
+  bool                                    is_chroma)
+{
+  const uint32_t pos_in_sbb = scan_pos & 15;
+  Decision* const trellis_entry = &ctxs->m_trellis[scan_pos];
+
+  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);
+
+  // Classify the position; 0 is kept for every position that is neither case.
+  enum ScanPosType spt = 0;
+  if (pos_in_sbb == 15) {
+    if (scan_pos > 16 && scan_pos < effHeight * effWidth - 1) {
+      spt = SCAN_SOCSBB;
+    }
+  }
+  else if (pos_in_sbb == 0) {
+    if (scan_pos > 0 && scan_pos < effHeight * effWidth - 16) {
+      spt = SCAN_EOCSBB;
+    }
+  }
+
+  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], trellis_entry, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);
+
+  if (scan_pos == 0) {
+    return;
+  }
+
+  if (pos_in_sbb == 0) {
+    SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
+    for (int id = 0; id < 4; ++id) {
+      uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, trellis_entry, id);
+    }
+    // Mirror the first four slots into slots 4-7.
+    memcpy(trellis_entry->prevId + 4, trellis_entry->prevId, 4 * sizeof(int32_t));
+    memcpy(trellis_entry->absLevel + 4, trellis_entry->absLevel, 4 * sizeof(int32_t));
+    memcpy(trellis_entry->rdCost + 4, trellis_entry->rdCost, 4 * sizeof(int64_t));
+  }
+  else if (!zeroOut) {
+    for (int id = 0; id < 4; ++id) {
+      uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, trellis_entry, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, id);
+    }
+  }
+
+  if (spt == SCAN_SOCSBB) {
+    SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int);
+  }
+}
+
+
+/**
+ * \brief Find the last scan position whose coefficient exceeds the threshold.
+ *
+ * Walks backwards from *firstTestPos and stops at the first position where
+ * the absolute coefficient is larger than m_thresLast / (4 * scale). The scale
+ * is the per-position q_coeff entry when scaling lists are enabled, otherwise
+ * the block's m_QScale. *firstTestPos receives the position found, or -1 if
+ * no coefficient exceeds the threshold.
+ *
+ * \param srcCoeff            Transform coefficients, indexed by block position.
+ * \param enableScalingLists  Selects per-position scales from q_coeff.
+ * \param dep_quant_context   Context store providing m_QScale and m_thresLast.
+ * \param scan                Scan order: scan position -> block position.
+ * \param q_coeff             Per-position quantization scales.
+ * \param firstTestPos        In: scan position to start from. Out: result.
+ * \param width               Not used by this implementation.
+ * \param height              Not used by this implementation.
+ */
+void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height)
+{
+  const int flat_scale = dep_quant_context->m_quant->m_QScale;
+  const int32_t last_threshold = dep_quant_context->m_quant->m_thresLast;
+  int pos = *firstTestPos;
+
+  while (pos >= 0) {
+    coeff_t limit;
+    if (enableScalingLists) {
+      limit = last_threshold / (4 * q_coeff[scan[pos]]);
+    }
+    else {
+      limit = last_threshold / (4 * flat_scale);
+    }
+    if (abs(srcCoeff[scan[pos]]) > limit) {
+      break;
+    }
+    pos--;
+  }
+
+  *firstTestPos = pos;
+}
+
+/**
+ * \brief Register the generic dependent quantization strategies.
+ *
+ * Both strategies are always registered; the results are combined with
+ * bitwise AND.
+ *
+ * \param opaque    Strategy selector handle passed through to the registry.
+ * \param bitdepth  Not used by this function.
+ * \returns         Combined result of both registrations.
+ */
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
+{
+  bool registered = true;
+
+  registered &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic);
+  registered &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic);
+
+  return registered;
+}
--- a/src/strategies/generic/depquant-generic.h
+++ b/src/strategies/generic/depquant-generic.h
@ -0,0 +1,50 @@
+#ifndef STRATEGIES_DEPQUANT_GENERIC_H_
+#define STRATEGIES_DEPQUANT_GENERIC_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Generic C implementations of optimized functions.
+ */
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "uvg266.h"
+#include "tables.h"
+
+
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_GENERIC_H_
--- a/src/strategies/generic/encode_coding_tree-generic.c
+++ b/src/strategies/generic/encode_coding_tree-generic.c
@ -54,11 +54,16 @@
 void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
  cabac_data_t * const cabac,
  const coeff_t *coeff,
-  uint8_t width,
+  const cu_loc_t * const cu_loc,
  uint8_t color,
  int8_t scan_mode,
  cu_info_t* cur_cu,
-  double* bits_out) {
+  double* bits_out) 
+{
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  const int width  = color == COLOR_Y ? cu_loc->width  : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;

  //const encoder_control_t * const encoder = state->encoder_control;
  //int c1 = 1;
@ -75,12 +80,12 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

  // CONSTANTS

-  const int height = width; // TODO: height for non-square blocks.
-  const uint32_t log2_block_size = uvg_g_convert_to_bit[width]+2;
-  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1];
-  const uint32_t *scan =
-    uvg_g_sig_last_scan[scan_mode][log2_block_size - 1];
-  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
+  const uint8_t log2_block_width =  uvg_g_convert_to_log2[width];
+  const uint8_t log2_block_height = uvg_g_convert_to_log2[height];
+  
+  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
+  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
+  const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);


  // Init base contexts according to block type
@ -90,12 +95,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
  unsigned scan_cg_last = (unsigned)-1;
  unsigned scan_pos_last = (unsigned)-1;

-  for (int i = 0; i < width * width; i++) {
+  for (int i = 0; i < (width * height); ++i) {
    if (coeff[scan[i]]) {
      scan_pos_last = i;
      sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1;
    }
  }
+
  scan_cg_last = scan_pos_last >> log2_cg_size;

  int pos_last = scan[scan_pos_last];
@ -120,28 +126,33 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
    last_coeff_x,
    last_coeff_y,
    width,
-    width,
+    height,
    color,
    scan_mode,
    bits_out);



-  uint32_t quant_state_transition_table = 0; //ToDo: dep quant enable changes this
+  uint32_t quant_state_transition_table = state->encoder_control->cfg.dep_quant ? 32040 : 0; 
  int32_t quant_state = 0;
  uint8_t  ctx_offset[16];
  int32_t temp_diag = -1;
  int32_t temp_sum = -1;

-  int32_t reg_bins = (width*width * 28) >> 4; //8 for 2x2
+  int32_t reg_bins = (width * height * 28) >> 4; //8 for 2x2

  // significant_coeff_flag
  for (i = scan_cg_last; i >= 0; i--) {

    //int32_t abs_coeff[64*64];
+    const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0];
+    const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
+    const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width);
+    const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height);
    int32_t cg_blk_pos = scan_cg[i];
-    int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2));
-    int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2)));
+    int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> log2_cg_width);
+    int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> log2_cg_width));
+

    // !!! residual_coding_subblock() !!!

@ -151,7 +162,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
    } else {
      uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0);
      uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
-        cg_pos_y, (MIN((uint8_t)32, width) >> (log2_cg_size / 2)));
+        cg_pos_y, cg_width, cg_height);
      CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "significant_coeffgroup_flag");
    }

@ -182,7 +193,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

        sig = (coeff[blk_pos] != 0) ? 1 : 0;
        if (num_non_zero || next_sig_pos != infer_sig_pos) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
          cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]);
          cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]);

@ -190,7 +201,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
          reg_bins--;

        } else if (next_sig_pos != scan_pos_last) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
        }


@ -256,7 +267,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
        blk_pos = scan[scan_pos];
        pos_y = blk_pos / width;
        pos_x = blk_pos - (pos_y * width);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4);

        rice_param = g_go_rice_pars[abs_sum];
        uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]);
@ -274,7 +285,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
        pos_y = blk_pos / width;
        pos_x = blk_pos - (pos_y * width);
        uint32_t coeff_abs = abs(coeff[blk_pos]);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0);
        rice_param = g_go_rice_pars[abs_sum];        
        pos0 = ((quant_state<2)?1:2) << rice_param;
        uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? coeff_abs - 1 : coeff_abs);
@ -291,7 +302,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

      uint32_t num_signs = num_non_zero;

-      if (state->encoder_control->cfg.signhide_enable && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) {
+      if (state->encoder_control->cfg.signhide_enable && !state->encoder_control->cfg.dep_quant && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) {
        num_signs--;
        coeff_signs >>= 1;
      }
--- a/src/strategies/generic/encode_coding_tree-generic.h
+++ b/src/strategies/generic/encode_coding_tree-generic.h
@ -44,7 +44,7 @@
 void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
                                  cabac_data_t * const cabac,
                                  const coeff_t *coeff,
-                                  uint8_t width,
+                                  const cu_loc_t * const loc,
                                  uint8_t color,
                                  int8_t scan_mode,
                                  cu_info_t* cur_cu,
--- a/src/strategies/generic/intra-generic.c
+++ b/src/strategies/generic/intra-generic.c
@ -34,6 +34,7 @@

 #include <stdlib.h>

+#include "cu.h"
 #include "intra.h"
 #include "uvg266.h"
 #include "strategyselector.h"
@ -42,25 +43,32 @@

 /**
 * \brief Generate angular predictions.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
 * \param intra_mode    Angular mode in range 2..34.
+ * \param channel_type  Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
- * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
+ * \param in_ref_left   Pointer to -1 index of left reference, length=height*2+1.
 * \param dst           Buffer of size width*width.
 * \param multi_ref_idx Multi reference line index for use with MRL.
 */
 static void uvg_angular_pred_generic(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
  const uvg_pixel *const in_ref_left,
  uvg_pixel *const dst,
-  const uint8_t multi_ref_idx)
+  const uint8_t multi_ref_idx,
+  const uint8_t isp_mode,
+  const int cu_dim)
 {
+  int width  = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width  = uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];
  
-  assert(log2_width >= 2 && log2_width <= 5);
-  assert(intra_mode >= 2 && intra_mode <= 66);
+  assert((log2_width >= 2 && log2_width <= 5) &&  log2_height <= 5);
+  // assert(intra_mode >= 2 && intra_mode <= 66);

  static const int16_t modedisp2sampledisp[32] = { 0,    1,    2,    3,    4,    6,     8,   10,   12,   14,   16,   18,   20,   23,   26,   29,   32,   35,   39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512, 1024 };
  static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp
@ -105,126 +113,105 @@ static void uvg_angular_pred_generic(
                                                    // Temporary buffer for modes 11-25.
                                                    // It only needs to be big enough to hold indices from -width to width-1.

+  uvg_pixel temp_dst[TR_MAX_WIDTH * TR_MAX_WIDTH];
+
+
  // TODO: check the correct size for these arrays when MRL is used
  //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  const int_fast32_t width = 1 << log2_width;
+  uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
+  uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };

  uint32_t pred_mode = intra_mode; // ToDo: handle WAIP

  uint8_t multi_ref_index = multi_ref_idx;
+  uint8_t isp = isp_mode;

  // Whether to swap references to always project on the left reference row.
  const bool vertical_mode = intra_mode >= 34;
  // Modes distance to horizontal or vertical mode.
  const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18);
-  //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
  
  // Sample displacement per column in fractions of 32.
-  const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
+  const int16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
  
-  // TODO: replace latter width with height
-  int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]);
+  const int side_size = vertical_mode ? log2_height : log2_width;
+  int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]);

  // Pointer for the reference we are interpolating from.
  uvg_pixel *ref_main;
  // Pointer for the other reference.
  const uvg_pixel *ref_side;
+  uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst;
+  
+  const int top_ref_length  = isp_mode == ISP_MODE_VER ? width + cu_dim  : width << 1;
+  const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1;

  // Set ref_main and ref_side such that, when indexed with 0, they point to
  // index 0 in block coordinates.
  if (sample_disp < 0) {
+    memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel));
+    memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel));

-    // TODO: for non-square blocks, separate loops for x and y is needed
-    for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
-      temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
-      temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
+    ref_main = vertical_mode ? temp_above + height : temp_left + width;
+    ref_side = vertical_mode ? temp_left + width : temp_above + height;
+
+    int size_side = vertical_mode ? height : width;
+    for (int i = -size_side; i <= -1; i++) {
+      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)];
    }
-
-    // TODO: take into account non-square blocks
-    ref_main = temp_main + width;
-    ref_side = temp_side + width;
-
-    // TODO: for non square blocks, need to check if width or height is used for reference extension
-    for (int i = -width; i <= -1; i++) {
-      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)];
-    }
-
-    //const uint32_t index_offset = width + 1;
-    //const int32_t last_index = width;
-    //const int_fast32_t most_negative_index = (width * sample_disp) >> 5;
-    //// Negative sample_disp means, we need to use both references.
-
-    //// TODO: update refs to take into account variating block size and shapes
-    ////       (height is not always equal to width)
-    //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
-    //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
-
-    //// Move the reference pixels to start from the middle to the later half of
-    //// the tmp_ref, so there is room for negative indices.
-    //for (int_fast32_t x = -1; x < width; ++x) {
-    //  tmp_ref[x + index_offset] = ref_main[x];
-    //}
-    //// Get a pointer to block index 0 in tmp_ref.
-    //ref_main = &tmp_ref[index_offset];
-    //tmp_ref[index_offset -1] = tmp_ref[index_offset];
-
-    //// Extend the side reference to the negative indices of main reference.
-    //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
-    //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
-    //// TODO: add 'vertical_mode ? height : width' instead of 'width'
-    //
-    //for (int_fast32_t x = -1; x > most_negative_index; x--) {
-    //  col_sample_disp += inv_abs_sample_disp;
-    //  int_fast32_t side_index = col_sample_disp >> 8;
-    //  tmp_ref[x + index_offset - 1] = ref_side[side_index - 1];
-    //}
-    //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1];
-    //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset];
  }
  else {
+    memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel));
+    memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel));

-    // TODO: again, separate loop needed for non-square blocks
-    for (int i = 0; i <= (width << 1) + multi_ref_index; i++) {
-      temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
-      temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
-    }
+    ref_main = vertical_mode ? temp_above : temp_left;
+    ref_side = vertical_mode ? temp_left : temp_above;

-    // TODO: this code block will need to change also when non-square blocks are used
-    // const int log2_ratio = 0;
-    const int s = 0;
+    const int log2_ratio = log2_width - log2_height;
+    const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio);
    const int max_index = (multi_ref_index << s) + 2;
-    const int ref_length = width << 1;
-    const uvg_pixel val = temp_main[ref_length + multi_ref_index];
+    int ref_length;
+    if (isp_mode) {
+      ref_length = vertical_mode ? top_ref_length : left_ref_length;
+    }
+    else {
+      ref_length = vertical_mode ? width << 1 : height << 1;
+    }
+    const uvg_pixel val = ref_main[ref_length + multi_ref_index];
    for (int j = 1; j <= max_index; j++) {
-      temp_main[ref_length + multi_ref_index +  j] = val;
+      ref_main[ref_length + multi_ref_index +  j] = val;
+    }
  }

-    ref_main = temp_main;
-    ref_side = temp_side;
-    //// sample_disp >= 0 means we don't need to refer to negative indices,
-    //// which means we can just use the references as is.
-    //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
-    //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
-
-    //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel));
-    //ref_main = &tmp_ref[width];
-    //tmp_ref[width-1] = tmp_ref[width];
-    //int8_t last_index = 1 + width*2;
-    //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1];
-  }

  // compensate for line offset in reference line buffers
  ref_main += multi_ref_index;
  ref_side += multi_ref_index;
+  if (!vertical_mode) { SWAP(width, height, int) }

  if (sample_disp != 0) {
+    bool use_cubic = true; // Default to cubic filter
+    static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
+    int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1];
+    int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
+    if (dist_from_vert_or_hor > filter_threshold) {
+      if ((abs(sample_disp) & 0x1F) != 0)
+      {
+        use_cubic = false;
+      }
+    }
+    // Cubic must be used if ref line != 0 or if isp mode is != 0
+    if (multi_ref_index || isp) {
+      use_cubic = true;
+    }
    // The mode is not horizontal or vertical, we have to do interpolation.

-    for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) {
+    for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) {
+
      int_fast32_t delta_int = delta_pos >> 5;
      int_fast32_t delta_fract = delta_pos & (32 - 1);
+      const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 };
+      int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff;

      if ((abs(sample_disp) & 0x1F) != 0) {
        
@ -232,25 +219,7 @@ static void uvg_angular_pred_generic(
        if (channel_type == 0) {
          int32_t ref_main_index = delta_int;
          uvg_pixel p[4];
-          bool use_cubic = true; // Default to cubic filter
-          static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
-          int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width];
-          int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
-          if (dist_from_vert_or_hor > filter_threshold) {
-            static const int16_t modedisp2sampledisp[32] = { 0,    1,    2,    3,    4,    6,     8,   10,   12,   14,   16,   18,   20,   23,   26,   29,   32,   35,   39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512, 1024 };
-            const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode;
-            const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
-            if ((abs(sample_disp) & 0x1F) != 0)
-            {
-              use_cubic = false;
-            }
-          }
-          // Cubic must be used if ref line != 0
-          if (multi_ref_index) {
-            use_cubic = true;
-          }
-          const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 };
-          int16_t const * const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff;
+
          // Do 4-tap intra interpolation filtering
          for (int_fast32_t x = 0; x < width; x++, ref_main_index++) {
            p[0] = ref_main[ref_main_index];
@ -258,7 +227,7 @@ static void uvg_angular_pred_generic(
            p[2] = ref_main[ref_main_index + 2];
            p[3] = ref_main[ref_main_index + 3];
         
-            dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6);
+            work[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6);

          }
        }
@ -268,26 +237,26 @@ static void uvg_angular_pred_generic(
          for (int_fast32_t x = 0; x < width; ++x) {
            uvg_pixel ref1 = ref_main[x + delta_int + 1];
            uvg_pixel ref2 = ref_main[x + delta_int + 2];
-            dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5);
+            work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5);
          }
        }
      }
      else {
        // Just copy the integer samples
        for (int_fast32_t x = 0; x < width; x++) {
-          dst[y * width + x] = ref_main[x + delta_int + 1];
+          work[y * width + x] = ref_main[x + delta_int + 1];
        }
      }

     
      // PDPC
-      bool PDPC_filter = (width >= 4 || channel_type != 0);
+      bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0;
      if (pred_mode > 1 && pred_mode < 67) {
        if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
          PDPC_filter = false;
        }
        else if (mode_disp > 0) {
-          PDPC_filter = (scale >= 0);
+          PDPC_filter &= (scale >= 0);
        }
      }
      if(PDPC_filter) {
@ -297,70 +266,50 @@ static void uvg_angular_pred_generic(

          int wL = 32 >> (2 * x >> scale);
          const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1];
-          dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6);
+          work[y * width + x] = work[y * width + x] + ((wL * (left - work[y * width + x]) + 32) >> 6);
        }
      }
-
-        /*
-      if (pred_mode == 2 || pred_mode == 66) {
-        int wT = 16 >> MIN(31, ((y << 1) >> scale));
-        for (int x = 0; x < width; x++) {
-          int wL = 16 >> MIN(31, ((x << 1) >> scale));
-          if (wT + wL == 0) break;
-          int c = x + y + 1;
-          if (c >= 2 * width) { wL = 0; }
-          if (c >= 2 * width) { wT = 0; }
-          const uvg_pixel left = (wL != 0) ? ref_side[c] : 0;
-          const uvg_pixel top  = (wT != 0) ? ref_main[c] : 0;
-          dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6);
-        }
-      } else if (sample_disp == 0 || sample_disp >= 12) {
-        int inv_angle_sum_0 = 2;
-        for (int x = 0; x < width; x++) {
-          inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)];
-          int delta_pos_0 = inv_angle_sum_0 >> 2;
-          int delta_frac_0 = delta_pos_0 & 63;
-          int delta_int_0 = delta_pos_0 >> 6;
-          int delta_y = y + delta_int_0 + 1;
-          // TODO: convert to JVET_K0500_WAIP
-          if (delta_y > width + width - 1) break;
-
-          int wL = 32 >> MIN(31, ((x << 1) >> scale));
-          if (wL == 0) break;
-          const uvg_pixel *p = ref_side + delta_y - 1;
-          uvg_pixel left = p[delta_frac_0 >> 5];
-          dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6);
-        }
-      }*/
    }
  }
  else {
    // Mode is horizontal or vertical, just copy the pixels.
    
-    // TODO: update outer loop to use height instead of width
-    for (int_fast32_t y = 0; y < width; ++y) {
-      for (int_fast32_t x = 0; x < width; ++x) {
-        dst[y * width + x] = ref_main[x + 1];
-      }
    // Do not apply PDPC if multi ref line index is other than 0
-      if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) {
-        int scale = (log2_width + log2_width - 2) >> 2;
+    // TODO: do not do PDPC if block is in BDPCM mode
+    bool do_pdpc = ((width >= 4 && height >= 4) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/);
+
+    if (do_pdpc) {
+      int scale = (log2_width + log2_height - 2) >> 2;
      const uvg_pixel top_left = ref_main[0];
+      for (int_fast32_t y = 0; y < height; ++y) {
+        memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel));
        const uvg_pixel left = ref_side[1 + y];
-        for (int i = 0; i < MIN(3 << scale, width); i++) {
-          const int wL = 32 >> (2 * i >> scale);
-          const uvg_pixel val = dst[y * width + i];
-          dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
+        for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) {
+          const int wL = 32 >> (2 * x >> scale);
+          const uvg_pixel val = work[y * width + x];
+          work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
        }
      }
+    } else {
+      for (int_fast32_t y = 0; y < height; ++y) {
+        memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel));
+      }
    }
  }

  // Flip the block if this is was a horizontal mode.
  if (!vertical_mode) {
-    for (int_fast32_t y = 0; y < width - 1; ++y) {
+    if(width == height) {
+      for (int_fast32_t y = 0; y < height - 1; ++y) {
        for (int_fast32_t x = y + 1; x < width; ++x) {
-        SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel);
+          SWAP(work[y * height + x], work[x * width + y], uvg_pixel);
+        }
+      }
+    } else {
+      for(int y = 0; y < width; ++y) {
+        for(int x = 0; x < height; ++x) {
+          dst[x + y * height] = work[y + x * width];
+        }
      }
    }
  }
@ -369,23 +318,32 @@ static void uvg_angular_pred_generic(

 /**
 * \brief Generate planar prediction.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
+ * \param color         Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
 * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
 * \param dst           Buffer of size width*width.
 */
 static void uvg_intra_pred_planar_generic(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uvg_pixel *const ref_top,
  const uvg_pixel *const ref_left,
  uvg_pixel *const dst)
 {
-  // TODO: Add height
-  assert(log2_width >= 2 && log2_width <= 5);
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width  = uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];
+
+  const int offset = 1 << (log2_width + log2_height);
+  const int final_shift = 1 + log2_width + log2_height;
+  
+  // If ISP is enabled log_dim 1 is possible (limit was previously 2)
+  assert((log2_width >= 2 && log2_width <= 5) &&  log2_height <= 5);

-  const int_fast8_t width = 1 << log2_width;
  const uvg_pixel top_right = ref_top[width + 1];
-  const uvg_pixel bottom_left = ref_left[width + 1];
+  const uvg_pixel bottom_left = ref_left[height + 1];

 #if 0
  // Unoptimized version for reference.
@ -397,18 +355,27 @@ static void uvg_intra_pred_planar_generic(
    }
  }
 #else
-  int_fast16_t top[32];
+  // TODO: get rid of magic numbers. Make a define for this
+  int_fast16_t top[64];
+  int_fast16_t bottom[64];
+  int_fast16_t left[64];
+  int_fast16_t right[64];
  for (int i = 0; i < width; ++i) {
-    top[i] = ref_top[i + 1] << log2_width;
+    bottom[i] = bottom_left - ref_top[i + 1];
+    top[i] = ref_top[i + 1] << log2_height;
  }

-  for (int y = 0; y < width; ++y) {
-    int_fast16_t hor = (ref_left[y + 1] << log2_width) + width;
+  for (int j = 0; j < height; ++j) {
+    right[j] = top_right - ref_left[j + 1];
+    left[j] = ref_left[j + 1] << log2_width;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    int_fast16_t hor = left[y];
    for (int x = 0; x < width; ++x) {
-      hor += top_right - ref_left[y + 1];
-      top[x] += bottom_left - ref_top[x + 1];
-      dst[y * width + x] = (hor + top[x]) >> (log2_width + 1);
-      //
+      hor += right[y];
+      top[x] += bottom[x];
+      dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift;
    }
  }
 #endif
@ -461,25 +428,26 @@ static void uvg_intra_pred_filtered_dc_generic(

 /**
 * \brief Position Dependent Prediction Combination for Planar and DC modes.
-* \param log2_width    Log2 of width, range 2..5.
-* \param width         Block width matching log2_width.
+* \param cu_loc        CU location and size data.
 * \param used_ref      Pointer used reference pixel struct.
 * \param dst           Buffer of size width*width.
 */
 static void uvg_pdpc_planar_dc_generic(
  const int mode,
-  const int width,
-  const int log2_width,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
  const uvg_intra_ref *const used_ref,
  uvg_pixel *const dst)
 {
  assert(mode == 0 || mode == 1);  // planar or DC
+  const int width =  color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width  = uvg_g_convert_to_log2[width];
+  const int log2_height = uvg_g_convert_to_log2[height];

-  // TODO: replace latter log2_width with log2_height
-  const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2);
+  const int scale = (log2_width + log2_height - 2) >> 2;

-  // TODO: replace width with height
-  for (int y = 0; y < width; y++) {
+  for (int y = 0; y < height; y++) {
    int wT = 32 >> MIN(31, ((y << 1) >> scale));
    for (int x = 0; x < width; x++) {
      int wL = 32 >> MIN(31, ((x << 1) >> scale));
--- a/src/strategies/generic/picture-generic.c
+++ b/src/strategies/generic/picture-generic.c
@ -32,6 +32,7 @@

 #include "strategies/generic/picture-generic.h"

+#include <math.h>
 #include <stdlib.h>

 #include "strategies/strategies-picture.h"
@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel)

 SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)

+/**
+ * \brief Hadamard-transformed absolute difference of one 2x2 block.
+ *
+ * \param piOrg       Top-left sample of the original 2x2 block.
+ * \param piCur       Top-left sample of the block it is compared against.
+ * \param iStrideOrg  Distance in samples between consecutive rows of piOrg.
+ * \param iStrideCur  Distance in samples between consecutive rows of piCur.
+ * \return  Sum of the absolute 2x2 Hadamard coefficients of the residual,
+ *          with the DC coefficient shifted right by two before it is added.
+ */
+static uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  // Residual samples (original minus candidate).
+  const coeff_t res_tl = piOrg[0] - piCur[0];
+  const coeff_t res_tr = piOrg[1] - piCur[1];
+  const coeff_t res_bl = piOrg[iStrideOrg] - piCur[iStrideCur];
+  const coeff_t res_br = piOrg[iStrideOrg + 1] - piCur[iStrideCur + 1];
+
+  // Vertical butterfly: column sums and column differences.
+  const coeff_t col0_sum = res_tl + res_bl;
+  const coeff_t col1_sum = res_tr + res_br;
+  const coeff_t col0_dif = res_tl - res_bl;
+  const coeff_t col1_dif = res_tr - res_br;
+
+  // Horizontal butterfly folded into the accumulation; the first term is
+  // the DC coefficient and is the only one that gets down-weighted.
+  uint64_t total = abs(col0_sum + col1_sum) >> 2;
+  total += abs(col0_sum - col1_sum);
+  total += abs(col0_dif + col1_dif);
+  total += abs(col0_dif - col1_dif);
+
+  return total;
+}
+
+
+/**
+ * \brief Hadamard-transformed absolute difference of one 16x8 block.
+ *
+ * \param piOrg       Top-left sample of the original block (16 wide, 8 high).
+ * \param piCur       Top-left sample of the block it is compared against.
+ * \param iStrideOrg  Distance in samples between consecutive rows of piOrg.
+ * \param iStrideCur  Distance in samples between consecutive rows of piCur.
+ * \return  Sum of absolute transform coefficients, with the DC coefficient
+ *          shifted right by two, scaled by 2 / sqrt(16 * 8) and truncated.
+ */
+static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  // TODO: a SIMD implementation is still missing for this block size.
+  enum { BLK_W = 16, BLK_H = 8 };
+  int coef[BLK_H][BLK_W];
+
+  // Residual: original minus candidate.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      coef[row][col] = piOrg[row * iStrideOrg + col] - piCur[row * iStrideCur + col];
+    }
+  }
+
+  // Horizontal pass: in-place butterflies over every row, span 8, 4, 2, 1.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int half = BLK_W / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_W; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[row][base + k];
+          const int b = coef[row][base + k + half];
+          coef[row][base + k]        = a + b;
+          coef[row][base + k + half] = a - b;
+        }
+      }
+    }
+  }
+
+  // Vertical pass: in-place butterflies over every column, span 4, 2, 1.
+  for (int col = 0; col < BLK_W; ++col) {
+    for (int half = BLK_H / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_H; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[base + k][col];
+          const int b = coef[base + k + half][col];
+          coef[base + k][col]        = a + b;
+          coef[base + k + half][col] = a - b;
+        }
+      }
+    }
+  }
+
+  int sad = 0;
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      sad += abs(coef[row][col]);
+    }
+  }
+
+  // Replace the full-weight DC contribution with a quarter-weight one.
+  sad -= abs(coef[0][0]);
+  sad += abs(coef[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+/**
+ * \brief Hadamard-transformed absolute difference of one 8x16 block.
+ *
+ * \param piOrg       Top-left sample of the original block (8 wide, 16 high).
+ * \param piCur       Top-left sample of the block it is compared against.
+ * \param iStrideOrg  Distance in samples between consecutive rows of piOrg.
+ * \param iStrideCur  Distance in samples between consecutive rows of piCur.
+ * \return  Sum of absolute transform coefficients, with the DC coefficient
+ *          shifted right by two, scaled by 2 / sqrt(16 * 8) and truncated.
+ */
+static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  enum { BLK_W = 8, BLK_H = 16 };
+  int coef[BLK_H][BLK_W];
+
+  // Residual: original minus candidate.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      coef[row][col] = piOrg[row * iStrideOrg + col] - piCur[row * iStrideCur + col];
+    }
+  }
+
+  // Horizontal pass: in-place butterflies over every row, span 4, 2, 1.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int half = BLK_W / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_W; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[row][base + k];
+          const int b = coef[row][base + k + half];
+          coef[row][base + k]        = a + b;
+          coef[row][base + k + half] = a - b;
+        }
+      }
+    }
+  }
+
+  // Vertical pass: in-place butterflies over every column, span 8, 4, 2, 1.
+  for (int col = 0; col < BLK_W; ++col) {
+    for (int half = BLK_H / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_H; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[base + k][col];
+          const int b = coef[base + k + half][col];
+          coef[base + k][col]        = a + b;
+          coef[base + k + half][col] = a - b;
+        }
+      }
+    }
+  }
+
+  int sad = 0;
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      sad += abs(coef[row][col]);
+    }
+  }
+
+  // Replace the full-weight DC contribution with a quarter-weight one.
+  sad -= abs(coef[0][0]);
+  sad += abs(coef[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+/**
+ * \brief Hadamard-transformed absolute difference of one 4x8 block.
+ *
+ * \param piOrg       Top-left sample of the original block (4 wide, 8 high).
+ * \param piCur       Top-left sample of the block it is compared against.
+ * \param iStrideOrg  Distance in samples between consecutive rows of piOrg.
+ * \param iStrideCur  Distance in samples between consecutive rows of piCur.
+ * \return  Sum of absolute transform coefficients, with the DC coefficient
+ *          shifted right by two, scaled by 2 / sqrt(4 * 8) and truncated.
+ */
+static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  enum { BLK_W = 4, BLK_H = 8 };
+  int coef[BLK_H][BLK_W];
+
+  // Residual: original minus candidate.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      coef[row][col] = piOrg[row * iStrideOrg + col] - piCur[row * iStrideCur + col];
+    }
+  }
+
+  // Horizontal pass: in-place butterflies over every row, span 2, 1.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int half = BLK_W / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_W; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[row][base + k];
+          const int b = coef[row][base + k + half];
+          coef[row][base + k]        = a + b;
+          coef[row][base + k + half] = a - b;
+        }
+      }
+    }
+  }
+
+  // Vertical pass: in-place butterflies over every column, span 4, 2, 1.
+  for (int col = 0; col < BLK_W; ++col) {
+    for (int half = BLK_H / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_H; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[base + k][col];
+          const int b = coef[base + k + half][col];
+          coef[base + k][col]        = a + b;
+          coef[base + k + half][col] = a - b;
+        }
+      }
+    }
+  }
+
+  int sad = 0;
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      sad += abs(coef[row][col]);
+    }
+  }
+
+  // Replace the full-weight DC contribution with a quarter-weight one.
+  sad -= abs(coef[0][0]);
+  sad += abs(coef[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+/**
+ * \brief Hadamard-transformed absolute difference of one 8x4 block.
+ *
+ * \param piOrg       Top-left sample of the original block (8 wide, 4 high).
+ * \param piCur       Top-left sample of the block it is compared against.
+ * \param iStrideOrg  Distance in samples between consecutive rows of piOrg.
+ * \param iStrideCur  Distance in samples between consecutive rows of piCur.
+ * \return  Sum of absolute transform coefficients, with the DC coefficient
+ *          shifted right by two, scaled by 2 / sqrt(4 * 8) and truncated.
+ */
+static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  enum { BLK_W = 8, BLK_H = 4 };
+  int coef[BLK_H][BLK_W];
+
+  // Residual: original minus candidate.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      coef[row][col] = piOrg[row * iStrideOrg + col] - piCur[row * iStrideCur + col];
+    }
+  }
+
+  // Horizontal pass: in-place butterflies over every row, span 4, 2, 1.
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int half = BLK_W / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_W; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[row][base + k];
+          const int b = coef[row][base + k + half];
+          coef[row][base + k]        = a + b;
+          coef[row][base + k + half] = a - b;
+        }
+      }
+    }
+  }
+
+  // Vertical pass: in-place butterflies over every column, span 2, 1.
+  for (int col = 0; col < BLK_W; ++col) {
+    for (int half = BLK_H / 2; half >= 1; half >>= 1) {
+      for (int base = 0; base < BLK_H; base += 2 * half) {
+        for (int k = 0; k < half; ++k) {
+          const int a = coef[base + k][col];
+          const int b = coef[base + k + half][col];
+          coef[base + k][col]        = a + b;
+          coef[base + k + half][col] = a - b;
+        }
+      }
+    }
+  }
+
+  int sad = 0;
+  for (int row = 0; row < BLK_H; ++row) {
+    for (int col = 0; col < BLK_W; ++col) {
+      sad += abs(coef[row][col]);
+    }
+  }
+
+  // Replace the full-weight DC contribution with a quarter-weight one.
+  sad -= abs(coef[0][0]);
+  sad += abs(coef[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+
+/**
+ * \brief SATD of a width x height block, accumulated tile by tile.
+ *
+ * The block is covered with tiles of a single size. The size is the first
+ * match, in this order: 16x8, 8x16, 8x4, 4x8 (rectangular tiles are only
+ * tried when the block itself is wider or taller in the same direction),
+ * then 8x8, 4x4 and 2x2.
+ *
+ * \param width        Block width in samples.
+ * \param height       Block height in samples.
+ * \param ref_in       Top-left sample of the original block.
+ * \param ref_stride   Distance in samples between consecutive rows of ref_in.
+ * \param pred_in      Top-left sample of the block it is compared against.
+ * \param pred_stride  Distance in samples between consecutive rows of pred_in.
+ * \return  Sum of the per-tile results.
+ *
+ * NOTE(review): when no tile size divides the block (either dimension odd)
+ * nothing is accumulated and the result is 0 -- confirm callers never rely
+ * on the value for such sizes.
+ */
+static uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride)
+{
+  enum had_tile {
+    HAD_TILE_NONE,
+    HAD_TILE_16X8,
+    HAD_TILE_8X16,
+    HAD_TILE_8X4,
+    HAD_TILE_4X8,
+    HAD_TILE_8X8,
+    HAD_TILE_4X4,
+    HAD_TILE_2X2
+  };
+
+  const int is_wide = width > height;
+  const int is_tall = width < height;
+
+  enum had_tile tile = HAD_TILE_NONE;
+  int step_x = 0;
+  int step_y = 0;
+
+  // Pick the tile shape; the order of these checks defines the priority.
+  if (is_wide && (height & 7) == 0 && (width & 15) == 0) {
+    tile = HAD_TILE_16X8; step_x = 16; step_y = 8;
+  } else if (is_tall && (width & 7) == 0 && (height & 15) == 0) {
+    tile = HAD_TILE_8X16; step_x = 8;  step_y = 16;
+  } else if (is_wide && (height & 3) == 0 && (width & 7) == 0) {
+    tile = HAD_TILE_8X4;  step_x = 8;  step_y = 4;
+  } else if (is_tall && (width & 3) == 0 && (height & 7) == 0) {
+    tile = HAD_TILE_4X8;  step_x = 4;  step_y = 8;
+  } else if ((height % 8 == 0) && (width % 8 == 0)) {
+    tile = HAD_TILE_8X8;  step_x = 8;  step_y = 8;
+  } else if ((height % 4 == 0) && (width % 4 == 0)) {
+    tile = HAD_TILE_4X4;  step_x = 4;  step_y = 4;
+  } else if ((height % 2 == 0) && (width % 2 == 0)) {
+    tile = HAD_TILE_2X2;  step_x = 2;  step_y = 2;
+  }
+
+  uint64_t total = 0;
+
+  if (tile != HAD_TILE_NONE) {
+    for (int y = 0; y < height; y += step_y) {
+      const uvg_pixel* org_row = ref_in + y * ref_stride;
+      const uvg_pixel* cur_row = pred_in + y * pred_stride;
+
+      for (int x = 0; x < width; x += step_x) {
+        switch (tile) {
+          case HAD_TILE_16X8:
+            total += xCalcHADs16x8(&org_row[x], &cur_row[x], ref_stride, pred_stride);
+            break;
+          case HAD_TILE_8X16:
+            total += xCalcHADs8x16(&org_row[x], &cur_row[x], ref_stride, pred_stride);
+            break;
+          case HAD_TILE_8X4:
+            total += xCalcHADs8x4(&org_row[x], &cur_row[x], ref_stride, pred_stride);
+            break;
+          case HAD_TILE_4X8:
+            total += xCalcHADs4x8(&org_row[x], &cur_row[x], ref_stride, pred_stride);
+            break;
+          case HAD_TILE_8X8:
+            total += satd_8x8_subblock_generic(&org_row[x], ref_stride, &cur_row[x], pred_stride);
+            break;
+          case HAD_TILE_4X4:
+            total += uvg_satd_4x4_subblock_generic(&org_row[x], ref_stride, &cur_row[x], pred_stride);
+            break;
+          case HAD_TILE_2X2:
+            total += xCalcHADs2x2(&org_row[x], &cur_row[x], ref_stride, pred_stride);
+            break;
+          default:
+            break;
+        }
+      }
+    }
+  }
+
+  // TODO: 10 bit
+  return (total >> 0);
+}
+
+
 // Function macro for defining SAD calculating functions
 // for fixed size blocks.
 #define SAD_NXN(n, pixel_type) \
@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel)

 static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
                 const int ref_stride, const int rec_stride,
-                 const int width)
+                 const int width, const int height)
 {
  int ssd = 0;
  int y, x;

-  for (y = 0; y < width; ++y) {
+  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
      ssd += diff * diff;
@ -783,10 +1355,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)


 static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, 
-  int width, int ref_stride, int pred_stride)
+  int width, int height, int ref_stride, int pred_stride)
 {
  int y, x;
-  for (y = 0; y < width; ++y) {
+  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
    }
@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
  success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
  success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
  success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
+  success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs);
  success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);

  success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -44,7 +44,6 @@
 #include "fast_coeff_cost.h"
 #include "reshape.h"

-#define QUANT_SHIFT 14
 /**
 * \brief quantize transformed coefficents
 *
@ -62,22 +61,28 @@ void uvg_quant_generic(
  uint8_t lfnst_idx)
 {
  const encoder_control_t * const encoder = state->encoder_control;
-  const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
-  const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1];
+  const uint32_t log2_tr_width  = uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height);

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
-  uint32_t log2_tr_width = uvg_math_floor_log2(height);
-  uint32_t log2_tr_height = uvg_math_floor_log2(width);
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size
+    
  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
  const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
-  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
+  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform
  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift );
  const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

+  const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
+
  uint32_t ac_sum = 0;

+  const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF;
+
  if(lfnst_idx == 0){
    for (int32_t n = 0; n < width * height; n++) {
      int32_t level = coef[n];
@ -86,7 +91,7 @@ void uvg_quant_generic(

      sign = (level < 0 ? -1 : 1);

-      int32_t curr_quant_coeff = quant_coeff[n];
+      int32_t curr_quant_coeff = use_scaling_list ? quant_coeff[n] : default_quant_coeff;
      level = (int32_t)((abs_level * curr_quant_coeff + add) >> q_bits);
      ac_sum += level;

@ -237,6 +242,7 @@ int uvg_quant_cbcr_residual_generic(
  encoder_state_t* const state, 
  const cu_info_t* const cur_cu,
  const int width,
+  const int height,
  const coeff_scan_order_t scan_order,
  const int in_stride, const int out_stride,
  const uvg_pixel* const u_ref_in, 
@ -247,28 +253,28 @@ int uvg_quant_cbcr_residual_generic(
  uvg_pixel* v_rec_out,
  coeff_t* coeff_out,
  bool early_skip, 
-  int lmcs_chroma_adj, enum uvg_tree_type tree_type
-  ) {
+  int lmcs_chroma_adj, enum uvg_tree_type tree_type) 
+{
  ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
  ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
  ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
  ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
-
+  // TODO: this function is not fully converted to handle non-square blocks
  {
    int y, x;
-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
        v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
      }
    }
  }
-  uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
-  uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
+  uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride);
+  uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride);
  
  
  const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
-  for (int y = 0; y < width; y++)
+  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
@ -305,33 +311,44 @@ int uvg_quant_cbcr_residual_generic(
  }


-  uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
-  if(cur_cu->cr_lfnst_idx) {
-    uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
+  uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
+  uint8_t lfnst_idx = tree_type == UVG_CHROMA_T ? cur_cu->cr_lfnst_idx : cur_cu->lfnst_idx;
+  if(lfnst_idx) {
+    uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode);
  }
-
-  if (state->encoder_control->cfg.rdoq_enable &&
+  int abs_sum = 0;
+  if (!false && state->encoder_control->cfg.dep_quant) {
+    uvg_dep_quant(
+      state,
+      cur_cu,
+      width,
+      height,
+      coeff,
+      coeff_out,
+      COLOR_U,
+      tree_type,
+      &abs_sum,
+      state->encoder_control->cfg.scaling_list);
+  }
+  else if (state->encoder_control->cfg.rdoq_enable &&
    (width > 4 || !state->encoder_control->cfg.rdoq_skip))
  {
-    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
-    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
-             scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
-      cur_cu->cr_lfnst_idx);
+    uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
+             scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx, 0);
  }
  else if (state->encoder_control->cfg.rdoq_enable && false) {
-    uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
+    uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
      scan_order);
  }
  else {
-    uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
-      scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx);
+    uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
+      scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, lfnst_idx);
  }

  int8_t has_coeffs = 0;
  {
    int i;
-    for (i = 0; i < width * width; ++i) {
+    for (i = 0; i < width * height; ++i) {
      if (coeff_out[i] != 0) {
        has_coeffs = 1;
        break;
@ -342,13 +359,13 @@ int uvg_quant_cbcr_residual_generic(
  if (has_coeffs && !early_skip) {

    // Get quantized residual. (coeff_out -> coeff -> residual)
-    uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
+    uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
      cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
-    if (cur_cu->cr_lfnst_idx) {
-      uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
+    if (lfnst_idx) {
+      uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode);
    }
    
-    uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
+    uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
    

    //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@ -371,7 +388,7 @@ int uvg_quant_cbcr_residual_generic(
    //}
    const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
    // Get quantized reconstruction. (residual + pred_in -> rec_out)
-    for (int y = 0; y < width; y++) {
+    for (int y = 0; y < height; y++) {
      for (int x = 0; x < width; x++) {
        if (temp == 2) {
          u_residual[x + y * width] = combined_residual[x + y * width];
@ -400,7 +417,7 @@ int uvg_quant_cbcr_residual_generic(
        }
      }
    }
-    for (int y = 0; y < width; ++y) {
+    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
        u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val);
@ -413,7 +430,7 @@ int uvg_quant_cbcr_residual_generic(
    // With no coeffs and rec_out == pred_int we skip copying the coefficients
    // because the reconstruction is just the prediction.

-    for (int y = 0; y < width; ++y) {
+    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
        v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
@ -441,7 +458,7 @@ int uvg_quant_cbcr_residual_generic(
 * \returns  Whether coeff_out contains any non-zero coefficients.
 */
 int uvg_quantize_residual_generic(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
  const coeff_scan_order_t scan_order, const int use_trskip,
  const int in_stride, const int out_stride,
  const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -454,19 +471,19 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,

  int has_coeffs = 0;

-  assert(width <= TR_MAX_WIDTH);
-  assert(width >= TR_MIN_WIDTH);
-
-  const int height = width; // TODO: height for non-square blocks
+  // With ISP these checks no longer apply, since width and height 2 is now possible
+  // With MTT even 1x16 and 16x1 ISP splits are possible
+  //assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH);
+  //assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH);

  // Get residual. (ref_in - pred_in -> residual)
-  uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
+  uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);

  if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
    int y, x;
    int sign, absval;
    int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        sign = residual[x + y * width] >= 0 ? 1 : -1;
        absval = sign * residual[x + y * width];
@ -477,43 +494,54 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,

  // Transform residual. (residual -> coeff)
  if (use_trskip) {
-    uvg_transformskip(state->encoder_control, residual, coeff, width);
+    uvg_transformskip(state->encoder_control, residual, coeff, width, height);
  }
  else {
-    uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+    uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
  }

-  const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
+  const uint8_t lfnst_index = tree_type != UVG_CHROMA_T || color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;

  if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
    // Forward low frequency non-separable transform
-    uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+    uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
  }
  

  // Quantize coeffs. (coeff -> coeff_out)
  
-  if (state->encoder_control->cfg.rdoq_enable &&
+  int abs_sum = 0;
+  if (!use_trskip && state->encoder_control->cfg.dep_quant) {
+    uvg_dep_quant(
+      state,
+      cur_cu,
+      width,
+      height,
+      coeff,
+      coeff_out,
+      color,
+      tree_type,
+      &abs_sum,
+      state->encoder_control->cfg.scaling_list);
+  }
+  else if (state->encoder_control->cfg.rdoq_enable &&
      (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip)
  {
-    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
-    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    uvg_rdoq(state, coeff, coeff_out, width, width, color,
-             scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
-      lfnst_index);
+    uvg_rdoq(state, coeff, coeff_out, width, height, color,
+             scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0);
  } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) {
-    uvg_ts_rdoq(state, coeff, coeff_out, width, width, color,
+    uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
      scan_order);
  } else {
  
-    uvg_quant(state, coeff, coeff_out, width, width, color,
+    uvg_quant(state, coeff, coeff_out, width, height, color,
      scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index);
  }

  // Check if there are any non-zero coefficients.
  {
    int i;
-    for (i = 0; i < width * width; ++i) {
+    for (i = 0; i < width * height; ++i) {
      if (coeff_out[i] != 0) {
        has_coeffs = 1;
        break;
@ -527,25 +555,25 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
    int y, x;

    // Get quantized residual. (coeff_out -> coeff -> residual)
-    uvg_dequant(state, coeff_out, coeff, width, width, color,
+    uvg_dequant(state, coeff_out, coeff, width, height, color,
      cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
    
    if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
      // Inverse low frequency non-separable transform
-      uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+      uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
    }
    if (use_trskip) {
-      uvg_itransformskip(state->encoder_control, residual, coeff, width);
+      uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
    }
    else {
-      uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+      uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
    }
    
    if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
      int y, x;
      int sign, absval;
      int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
-      for (y = 0; y < width; ++y) {
+      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) {
          residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
          sign = residual[x + y * width] >= 0 ? 1 : -1;
@ -561,7 +589,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
    }

    // Get quantized reconstruction. (residual + pred_in -> rec_out)
-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
        rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val);
@ -573,7 +601,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
    // because the reconstruction is just the prediction.
    int y, x;

-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
      }
@ -590,23 +618,29 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
 void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
  const encoder_control_t * const encoder = state->encoder_control;
+  if(encoder->cfg.dep_quant && !transform_skip) {
+    uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
+    return;
+  }
  int32_t shift,add,coeff_q;
  int32_t n;
-  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
+  const uint32_t log2_tr_width  = uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform

+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;

-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);

  if (encoder->scaling_list.enable)
  {
-    uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
-    uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);

-    const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6];
+    const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
    shift += 4;

    if (shift >qp_scaled / 6) {
@ -624,7 +658,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
      }
    }
  } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
    add = 1 << (shift-1);

    for (n = 0; n < width * height; n++) {
@ -651,14 +685,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
  weights[3] = (wts_packed >> 48) & 0xffff;
 }

-static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
  uint32_t sum = 0;
  uint16_t weights_unpacked[4];

  get_coeff_weights(weights, weights_unpacked);

-  for (int32_t i = 0; i < width * width; i++) {
+  for (int32_t i = 0; i < width * height; i++) {
     int16_t curr = coeff[i];
    uint32_t curr_abs = abs(curr);
    if (curr_abs > 3) {
--- a/src/strategies/generic/quant-generic.h
+++ b/src/strategies/generic/quant-generic.h
@ -44,8 +44,6 @@
 #include "uvg266.h"
 #include "tables.h"

-#define QUANT_SHIFT 14
-
 int uvg_strategy_register_quant_generic(void* opaque, uint8_t bitdepth);
 void uvg_quant_generic(
  const encoder_state_t * const state,
@ -60,7 +58,7 @@ void uvg_quant_generic(
  uint8_t lfnst_idx);

 int uvg_quantize_residual_generic(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
  const coeff_scan_order_t scan_order, const int use_trskip,
  const int in_stride, const int out_stride,
  const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -71,6 +69,7 @@ int uvg_quant_cbcr_residual_generic(
  encoder_state_t* const state,
  const cu_info_t* const cur_cu,
  const int width,
+  const int height,
  const coeff_scan_order_t scan_order,
  const int in_stride, const int out_stride,
  const uvg_pixel* const u_ref_in,
--- a/src/strategies/strategies-dct.c
+++ b/src/strategies/strategies-dct.c
@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0;
 dct_func * uvg_dct_8x8 = 0;
 dct_func * uvg_dct_16x16 = 0;
 dct_func * uvg_dct_32x32 = 0;
+dct_func * uvg_dct_non_square = 0;

 dct_func * uvg_fast_inverse_dst_4x4 = 0;

@ -56,16 +57,19 @@ void(*uvg_mts_dct)(int8_t bitdepth,
  color_t color,
  const cu_info_t *tu,
  int8_t width,
+  int8_t height,
  const int16_t *input,
  int16_t *output,
-  const int8_t mts_idx);
+  const int8_t mts_type);
+
 void(*uvg_mts_idct)(int8_t bitdepth,
  color_t color,
  const cu_info_t *tu,
  int8_t width,
+  int8_t height,
  const int16_t *input,
  int16_t *output,
-  const int8_t mts_idx);
+  const int8_t mts_type);


 int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
@ -90,8 +94,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
 *
 * \returns Pointer to the function.
 */
-dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
+dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
 {
+  if (width != height) {
+    // Non-square block. Return generic dct for non-square blocks.
+    assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function.");
+    //return uvg_dct_non_square;
+  }
  switch (width) {
  case 4:
    //if (color == COLOR_Y && type == CU_INTRA) {
@ -119,8 +128,13 @@ dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
 *
 * \returns Pointer to the function.
 */
-dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type)
+dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
 {
+  if (width != height) {
+    // Non-square block. Return generic dct for non-square blocks.
+    assert(false && "This should never be called at this point. Non-square stuff is done inside mts_idct function.");
+    //return uvg_idct_non_square;
+  }
  switch (width) {
  case 4:
    //if (color == COLOR_Y && type == CU_INTRA) {
--- a/src/strategies/strategies-dct.h
+++ b/src/strategies/strategies-dct.h
@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4;
 extern dct_func * uvg_dct_8x8;
 extern dct_func * uvg_dct_16x16;
 extern dct_func * uvg_dct_32x32;
+extern dct_func * uvg_dct_non_square;

 extern dct_func * uvg_fast_inverse_dst_4x4;

@ -64,9 +65,10 @@ typedef void (mts_dct_func)(
  color_t color,
  const cu_info_t* tu,
  int8_t width,
+  int8_t height,
  const int16_t* input,
  int16_t* output,
-  const int8_t mts_idx);
+  const int8_t mts_type);

 extern mts_dct_func* uvg_mts_dct;

@ -75,15 +77,16 @@ typedef void (mts_idct_func)(
  color_t color,
  const cu_info_t* tu,
  int8_t width,
+  int8_t height,
  const int16_t* input,
  int16_t* output,
-  const int8_t mts_idx);
+  const int8_t mts_type);

 extern mts_idct_func* uvg_mts_idct;

 int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth);
-dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type);
-dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type);
+dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type);
+dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type);



--- a/src/strategies/strategies-depquant.c
+++ b/src/strategies/strategies-depquant.c
@ -0,0 +1,55 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+#include "strategies/strategies-depquant.h"
+
+#include "strategies/avx2/depquant-avx2.h"
+#include "strategies/generic/depquant-generic.h"
+#include "strategyselector.h"
+
+
+// Define function pointers; the strategy selector assigns them through STRATEGIES_DEPQUANT_EXPORTS.
+dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
+find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
+
+// Registers the dep quant strategies. Each call's result is AND-ed into the return value; the caller treats zero as failure.
+int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth)
+{
+  bool success = true;
+
+  success &= uvg_strategy_register_depquant_generic(opaque, bitdepth); // generic version is always registered
+
+  if (uvg_g_hardware_flags.intel_flags.avx2) { // AVX2 version is registered only when the hardware flag is set
+    success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth);
+  }
+  return success;
+}
--- a/src/strategies/strategies-depquant.h
+++ b/src/strategies/strategies-depquant.h
@ -0,0 +1,88 @@
+#ifndef STRATEGIES_DEPQUANT_H_
+#define STRATEGIES_DEPQUANT_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for dependent quantization functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "uvg266.h"
+#include "dep_quant.h"
+
+
+// Function types of the dep quant strategy entry points.
+typedef int(dep_quant_decide_and_update_func)(
+  rate_estimator_t*                       re,
+  context_store*                          ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t                           absCoeff,
+  const uint32_t                          scan_pos,
+  const uint32_t                          width_in_sbb,
+  const uint32_t                          height_in_sbb,
+  const NbInfoSbb                         next_nb_info_ssb,
+  bool                                    zeroOut,
+  coeff_t                                 quantCoeff,
+  const uint32_t                          effWidth,
+  const uint32_t                          effHeight,
+  bool                                    is_chroma);
+
+typedef void (find_first_non_zero_coeff_func)(
+  const coeff_t*             srcCoeff,
+  const bool                 enableScalingLists,
+  const context_store* const dep_quant_context,
+  const uint32_t* const      scan,
+  const int32_t*             q_coeff,
+  int*                       firstTestPos,
+  int                        width,
+  int                        height);
+
+
+// Function pointers defined in strategies-depquant.c and assigned by the strategy selector.
+extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
+extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
+
+int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); // a zero return is treated as failure by uvg_strategyselector_init
+
+// Strategy table entries: type name paired with the pointer that receives the chosen implementation. The last entry ends in a line continuation, so the line after the macro must stay empty.
+
+#define STRATEGIES_DEPQUANT_EXPORTS \
+  {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
+  {"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \
+
+
+
+#endif //STRATEGIES_DEPQUANT_H_
--- a/src/strategies/strategies-encode.h
+++ b/src/strategies/strategies-encode.h
@ -49,7 +49,7 @@
 typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
                                         cabac_data_t * const cabac,
                                         const coeff_t *coeff,
-                                         uint8_t width,
+                                         const cu_loc_t * const loc,
                                         uint8_t color,
                                         int8_t scan_mode,
                                         cu_info_t* cur_cu,
--- a/src/strategies/strategies-intra.h
+++ b/src/strategies/strategies-intra.h
@ -38,22 +38,26 @@
 * Interface for intra prediction functions.
 */

+#include "cu.h"
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"
 #include "uvg266.h"


 typedef void (angular_pred_func)(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
  const uvg_pixel *const in_ref_left,
  uvg_pixel *const dst,
-  const uint8_t multi_ref_idx);
+  const uint8_t multi_ref_idx,
+  const uint8_t isp_mode,
+  const int cu_dim);

 typedef void (intra_pred_planar_func)(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uvg_pixel *const ref_top,
  const uvg_pixel *const ref_left,
  uvg_pixel *const dst);
@ -67,8 +71,8 @@ typedef void (intra_pred_filtered_dc_func)(

 typedef void (pdpc_planar_dc_func)(
  const int mode,
-  const int width,
-  const int log2_width,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
  const uvg_intra_ref *const used_ref,
  uvg_pixel *const dst);

--- a/src/strategies/strategies-picture.c
+++ b/src/strategies/strategies-picture.c
@ -37,6 +37,7 @@
 #include "strategies/generic/picture-generic.h"
 #include "strategies/sse2/picture-sse2.h"
 #include "strategies/sse41/picture-sse41.h"
+#include "strategies/sse42/picture-sse42.h"
 #include "strategyselector.h"


@ -70,6 +71,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0;
 cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0;

 cost_pixel_any_size_func * uvg_satd_any_size = 0;
+cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0;
 cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0;

 pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0;
@ -115,13 +117,14 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) {
 /**
 * \brief  Get a function that calculates SATD for NxN block.
 *
-* \param n  Width of the region for which SATD is calculated.
+* \param width  Width of the region for which SATD is calculated.
 *
 * \returns  Pointer to cost_16bit_nxn_func.
 */
-cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n)
+cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned width, unsigned height)
 {
-  switch (n) {
+  if(width == height) {
+    switch (width) {
      case 4:
        return uvg_satd_4x4;
      case 8:
@ -136,18 +139,21 @@ cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n)
        return NULL;
    }
  }
+  return NULL;
+}


 /**
 * \brief  Get a function that calculates SAD for NxN block.
 *
-* \param n  Width of the region for which SAD is calculated.
+* \param width  Width of the region for which SAD is calculated.
 *
 * \returns  Pointer to cost_16bit_nxn_func.
 */
-cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n)
+cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned width, unsigned height)
 {
-  switch (n) {
+  if(width == height) {
+    switch (width) {
      case 4:
        return uvg_sad_4x4;
      case 8:
@ -162,17 +168,21 @@ cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n)
        return NULL;
    }
  }
+  return NULL;
+}

 /**
 * \brief  Get a function that calculates SATDs for 2 NxN blocks.
 *
-* \param n  Width of the region for which SATD is calculated.
+* \param width  Width of the region for which SATD is calculated.
+* \param height  Height of the region for which SATD is calculated.
 *
 * \returns  Pointer to cost_pixel_nxn_multi_func.
 */
-cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n)
+cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height)
 {
-  switch (n) {
+  if(width == height) {
+    switch (width) {
      case 4:
        return uvg_satd_4x4_dual;
      case 8:
@ -187,18 +197,21 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n)
        return NULL;
    }
  }
+  return NULL;
+}


 /**
 * \brief  Get a function that calculates SADs for 2 NxN blocks.
 *
-* \param n  Width of the region for which SAD is calculated.
+* \param width  Width of the region for which SAD is calculated.
 *
 * \returns  Pointer to cost_pixel_nxn_multi_func.
 */
-cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n)
+cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height)
 {
-  switch (n) {
+  if(width == height) {
+    switch (width) {
      case 4:
        return uvg_sad_4x4_dual;
      case 8:
@ -213,6 +226,8 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n)
        return NULL;
    }
  }
+  return NULL;
+}

 // Precomputed CRC32C lookup table for polynomial 0x04C11DB7
 const uint32_t uvg_crc_table[256] = {
--- a/src/strategies/strategies-picture.h
+++ b/src/strategies/strategies-picture.h
@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)(
 typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out);
 typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);

-typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
+typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height);
 typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
 typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
                                int32_t block_width, int32_t block_height,
@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,

 typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);

-typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
+typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride);


 extern const uint32_t uvg_crc_table[256];
@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16;
 extern cost_pixel_nxn_func * uvg_satd_32x32;
 extern cost_pixel_nxn_func * uvg_satd_64x64;
 extern cost_pixel_any_size_func *uvg_satd_any_size;
+extern cost_pixel_any_size_func *uvg_satd_any_size_vtm;

 extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual;
 extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual;
@ -203,8 +204,8 @@ extern pixel_var_func *uvg_pixel_var;
 extern generate_residual_func* uvg_generate_residual;

 int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth);
-cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n);
-cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n);
+cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height);
+cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height);

 #define STRATEGIES_PICTURE_EXPORTS \
  {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \
@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n);
  {"satd_32x32", (void**) &uvg_satd_32x32}, \
  {"satd_64x64", (void**) &uvg_satd_64x64}, \
  {"satd_any_size", (void**) &uvg_satd_any_size}, \
+  {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \
  {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \
  {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \
  {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \
--- a/src/strategies/strategies-quant.c
+++ b/src/strategies/strategies-quant.c
@ -46,7 +46,8 @@ coeff_abs_sum_func *uvg_coeff_abs_sum;
 fast_coeff_cost_func *uvg_fast_coeff_cost;


-int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) {
+int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth)
+{
  bool success = true;

  success &= uvg_strategy_register_quant_generic(opaque, bitdepth);
--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@ -45,12 +45,23 @@
 #include "tables.h"

 // Declare function pointers.
-typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
-  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx);
+typedef unsigned (quant_func)(
+  const encoder_state_t * const state, 
+  coeff_t *coef, 
+  coeff_t *q_coef, 
+  int32_t width,
+  int32_t height, 
+  color_t color, 
+  int8_t scan_idx, 
+  int8_t block_type, 
+  int8_t transform_skip, 
+  uint8_t lfnst_idx);
+
 typedef unsigned (quant_cbcr_func)(
  encoder_state_t* const state,
  const cu_info_t* const cur_cu,
  const int width,
+  const int height,
  const coeff_scan_order_t scan_order,
  const int in_stride, const int out_stride,
  const uvg_pixel* const u_ref_in,
@ -63,16 +74,19 @@ typedef unsigned (quant_cbcr_func)(
  bool early_skip,
  int lmcs_chroma_adj, 
  enum uvg_tree_type tree_type);
+
 typedef unsigned (quant_residual_func)(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
  const coeff_scan_order_t scan_order, const int use_trskip,
  const int in_stride, const int out_stride,
  const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
  uvg_pixel *rec_out, coeff_t *coeff_out,
  bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type);
+
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
  int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
-typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
+
+typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights);

 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
    fprintf(stderr, "uvg_strategy_register_encode failed!\n");
    return 0;
  }
+  if (!uvg_strategy_register_depquant(&strategies, bitdepth)) {
+    fprintf(stderr, "uvg_strategy_register_depquant failed!\n");
+    return 0;
+  }
  
  while(cur_strategy_to_select->fptr) {
    *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
 #include "strategies/strategies-encode.h"
+#include "strategies/strategies-depquant.h"
 #include "strategies/strategies-alf.h"

 static const strategy_to_select_t strategies_to_select[] = {
@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = {
  STRATEGIES_SAO_EXPORTS
  STRATEGIES_ENCODE_EXPORTS
  STRATEGIES_ALF_EXPORTS
+  STRATEGIES_DEPQUANT_EXPORTS
  { NULL, NULL },
 };

--- a/src/tables.c
+++ b/src/tables.c
--- a/src/tables.h
+++ b/src/tables.h
@ -134,6 +134,15 @@ typedef enum
 */
 extern const uint32_t* const uvg_g_sig_last_scan[3][5];
 extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1];
+extern const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1];
 extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2];

+#define SCAN_GROUP_TYPES 2
+#define MAX_LOG2_INDEX 7
+
+#define SCAN_GROUP_UNGROUPED 0
+#define SCAN_GROUP_4X4 1
+
+const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h);
+
 #endif //TABLES_H_
--- a/src/transform.c
+++ b/src/transform.c
--- a/src/transform.h
+++ b/src/transform.h
@ -44,23 +44,28 @@
 #include "global.h" // IWYU pragma: keep

 extern const uint8_t uvg_g_chroma_scale[58];
-extern const int16_t uvg_g_inv_quant_scales[6];
-extern const int16_t uvg_g_quant_scales[6];
+extern const int16_t uvg_g_inv_quant_scales[2][6];
+extern const int16_t uvg_g_quant_scales[2][6];

-void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
-void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
+#define COEFF_ORDER_LINEAR 0
+#define COEFF_ORDER_CU 1
+
+void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);
+void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);

 void uvg_transform2d(const encoder_control_t * const encoder,
                     int16_t *block,
                     int16_t *coeff,
-                     int8_t block_size,
+                     int8_t block_width,
+                     int8_t block_height,
                     color_t color,
                     const cu_info_t *tu);

 void uvg_itransform2d(const encoder_control_t * const encoder,
                      int16_t *block,
                      int16_t *coeff,
-                      int8_t block_size,
+                      int8_t block_width,
+                      int8_t block_height,
                      color_t color,
                      const cu_info_t *tu);

@ -69,11 +74,12 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con

 void uvg_derive_lfnst_constraints(
  cu_info_t* const pred_cu,
-  const int depth,
  bool* constraints,
  const coeff_t* coeff,
  const int width,
-  const int height);
+  const int height,
+  const vector2d_t * const ,
+  color_t color);

 typedef struct {
  double best_u_cost;
@ -82,6 +88,10 @@ typedef struct {
  int best_u_index;
  int best_v_index;
  int best_combined_index;
+  uint64_t u_distortion;
+  uint64_t v_distortion;
+  double   u_bits;
+  double   v_bits;
 } uvg_chorma_ts_out_t;

 void uvg_quantize_lcu_residual(
@ -89,9 +99,7 @@ void uvg_quantize_lcu_residual(
  bool luma,
  bool chroma,
  const bool jccr,
-  int32_t x,
-  int32_t y,
-  uint8_t depth,
+  const cu_loc_t* cu_loc,
  cu_info_t *cur_cu,
  lcu_t* lcu,
  bool early_skip,
@ -99,13 +107,10 @@ void uvg_quantize_lcu_residual(

 void uvg_chroma_transform_search(
  encoder_state_t* const state,
-  int depth,
  lcu_t* const lcu,
  cabac_data_t* temp_cabac,
-  int8_t width,
-  int8_t height,
+  const cu_loc_t* const cu_loc,
  const int offset,
-  const uint8_t mode,
  cu_info_t* pred_cu,
  uvg_pixel u_pred[1024],
  uvg_pixel v_pred[1024],
@ -130,7 +135,8 @@ void uvg_fwd_lfnst(
  const color_t color,
  const uint16_t lfnst_idx,
  coeff_t *coeffs,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  int8_t luma_mode);

 void uvg_inv_lfnst(
  const cu_info_t* cur_cu,
@ -139,6 +145,7 @@ void uvg_inv_lfnst(
  const color_t color,
  const uint16_t lfnst_idx,
  coeff_t* coeffs,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  int8_t luma_mode);

 #endif
--- a/src/uvg266.h
+++ b/src/uvg266.h
@ -338,7 +338,6 @@ typedef struct uvg_config
  int32_t trskip_max_size;    /*!< \brief Transform skip max block size. */
  enum uvg_mts mts;        /*< \brief flag to enable multiple transform selection*/
  int32_t mts_implicit;        /*< \brief flag to enable implicit multiple transform selection*/
-  int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */
  enum uvg_ime_algorithm ime_algorithm;  /*!< \brief Integer motion estimation algorithm. */
  int32_t fme_level;      /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled). */
  int8_t source_scan_type; /*!< \brief Source scan type (0: progressive, 1: top field first, 2: bottom field first).*/
@ -526,6 +525,8 @@ typedef struct uvg_config
  /** \brief enable low frequency non-separable transform */
  int8_t lfnst;

+  /** \brief enable intra sub partitions (ISP) */
+  int8_t isp;

  int8_t jccr;

@ -542,9 +543,16 @@ typedef struct uvg_config

  uint8_t dual_tree;

+  uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/
+  uint8_t max_bt_size[3]; /* intra, inter, dual tree chroma*/
+  uint8_t max_tt_size[3]; /* intra, inter, dual tree chroma*/
+
+  uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma*/
+
  uint8_t intra_rough_search_levels;

  uint8_t ibc; /* \brief Intra Block Copy parameter */
+  uint8_t dep_quant;
 } uvg_config;

 /**
--- a/src/videoframe.c
+++ b/src/videoframe.c
@ -61,7 +61,7 @@ videoframe_t * uvg_videoframe_alloc(int32_t width,
    frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
    if (cclm) {
      assert(chroma_format == UVG_CSP_420);
-      frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+      frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 15) & ~7) + FRAME_PADDING_LUMA) / 4);
      frame->cclm_luma_rec_top_line = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64));
    }
  }
--- a/tests/check_cabac_state_consistency.py
+++ b/tests/check_cabac_state_consistency.py
@ -30,7 +30,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int
    with open(state_file, "rb") as file:
        try:
            while True:
-                type_, x, y, depth, tree_type = file.read(15).decode().split()
+                type_, x, y, depth, tree_type = file.read(23).decode().split()
                # Reset stored data at the beginning of the frame
                if x == '0' and y == '0' and type_ == "S" and tree_type != "2":
                    if not was_zero_last:
@ -38,7 +38,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int
                        ctx_store = dict()
                        e_store = set()
                    was_zero_last = True
-                else:
+                elif int(x) >= 64 and int(y) >= 64:
                    was_zero_last = False

                ctx = file.read(ctx_count * ctx_size)
--- a/tests/mts_tests.c
+++ b/tests/mts_tests.c
@ -111,7 +111,8 @@ static void setup_tests()
          tu.tr_idx = MTS_DST7_DST7 + trafo;
          tu.lfnst_idx = 0;
          tu.cr_lfnst_idx = 0;
-          mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
+          tu.intra.isp_mode = 0;
+          mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
        }
      }      
    }
@ -134,7 +135,8 @@ static void setup_tests()
          tu.tr_idx = MTS_DST7_DST7 + trafo;
          tu.lfnst_idx = 0;
          tu.cr_lfnst_idx = 0;
-          idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH);
+          tu.intra.isp_mode = 0;
+          idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH);
        }
      }
      
@ -156,6 +158,7 @@ TEST dct(void)
 {
  char testname[100];
  for (int blocksize = 0; blocksize < NUM_SIZES; blocksize++) {
+    size_t size = 1 << (LCU_MIN_LOG_W + blocksize);
    for (int trafo = 0; trafo < NUM_TRANSFORM; trafo++) {      
      sprintf(testname, "Block: %d x %d, trafo: %d", 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), trafo);
      cu_info_t tu;
@ -163,15 +166,21 @@ TEST dct(void)
      tu.tr_idx = MTS_DST7_DST7 + trafo;
      tu.lfnst_idx = 0;
      tu.cr_lfnst_idx = 0;
+      tu.intra.isp_mode = 0;

      int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
      ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };

-      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
+      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);

-      for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
+      for (int y = 0; y < size; ++y) {
+        if (y>= 16) break;
+        for (int x = 0; x < size; ++x) {
+          if (x >= 16) break;
+          int i = y * size + x;
          ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]);
        }
+      }
      //fprintf(stderr, "PASS: %s\r\n", testname);
    }
  }
@ -188,11 +197,14 @@ TEST idct(void)
      cu_info_t tu;
      tu.type = CU_INTRA;
      tu.tr_idx = MTS_DST7_DST7 + trafo;
+      tu.lfnst_idx = 0;
+      tu.cr_lfnst_idx = 0;
+      tu.intra.isp_mode = 0;

      int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
      ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };

-      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
+      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);

      for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
        ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]);
--- a/tests/mv_cand_tests.c
+++ b/tests/mv_cand_tests.c
@ -46,8 +46,11 @@ TEST test_get_spatial_merge_cand(void)

  merge_candidates_t cand = { 0 };

-  get_spatial_merge_candidates(64 + 32, 64, // x, y
-                               32, 24,      // width, height
+  cu_loc_t cu_loc;
+  uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y
+    32, 24); // width, height
+
+  get_spatial_merge_candidates(&cu_loc,      
                               1920, 1080,  // picture size
                               &lcu,
                               &cand,
--- a/tests/test_cabac_state.sh
+++ b/tests/test_cabac_state.sh
@ -6,10 +6,10 @@ set -eu

 cabacfile="$(mktemp)"

-valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
+valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
 python3 check_cabac_state_consistency.py "${cabacfile}"

-valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
+valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
 python3 check_cabac_state_consistency.py "${cabacfile}"

 rm -rf "${cabacfile}"
--- a/tests/test_intra.sh
+++ b/tests/test_intra.sh
@ -19,3 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra
 valgrind_test $common_args --rd=3 --cclm --jccr
 valgrind_test $common_args --lfnst
 valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0
+valgrind_test $common_args --rd=2 --isp --cpuid=0 --fast-residual-cost 0
+valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra --fast-residual-cost 0
--- a/tests/test_mtt.sh
+++ b/tests/test_mtt.sh
@ -0,0 +1,14 @@
+#!/bin/sh
+
+# Test all-intra coding with multi-type tree (MTT) splits enabled.
+
+set -eu
+
+. "${0%/*}/util.sh"
+
+common_args='264x130 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-cpuid --no-wpp --fast-residual-cost 0'
+valgrind_test $common_args --rd=0 --mtt-depth-intra 1 --pu-depth-intra 2-3
+valgrind_test $common_args --rd=3 --mtt-depth-intra 1 --pu-depth-intra 0-5
+valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --pu-depth-intra 0-8
+valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8
+valgrind_test $common_args --rd=3 --rdoq --jccr --isp --lfnst --mip --mrl --mts intra --cclm --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8
--- a/tools/generate_tables.c
+++ b/tools/generate_tables.c
@ -51,7 +51,7 @@ static void init_sig_last_scan(uint32_t *buff_d, uint32_t *buff_h,
                               uint32_t *buff_v,
                               int32_t width, int32_t height)
 {
-  uint32_t num_scan_pos  = width * width;
+  uint32_t num_scan_pos  = width * height;
  uint32_t next_scan_pos = 0;
  int32_t  xx, yy, x, y;
  uint32_t scan_line;