From 626c9b02eaf45d6e5923a351a9ce879ecde546d9 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 3 Aug 2022 13:23:27 +0300
Subject: [PATCH] [isp] Modify transform and quantization functions to handle
 non-square blocks. Add strategy headers to CMakeLists.txt.

---
 CMakeLists.txt                           |  2 +-
 src/cu.h                                 |  4 +-
 src/search.c                             | 28 +++++-----
 src/search_inter.c                       |  2 +
 src/search_intra.c                       | 15 +++---
 src/strategies/avx2/dct-avx2.c           |  4 +-
 src/strategies/avx2/intra-avx2.c         |  3 ++
 src/strategies/avx2/picture-avx2.c       |  4 +-
 src/strategies/avx2/quant-avx2.c         | 12 ++---
 src/strategies/generic/dct-generic.c     | 18 +++++--
 src/strategies/generic/picture-generic.c |  4 +-
 src/strategies/generic/quant-generic.c   | 35 ++++++------
 src/strategies/generic/quant-generic.h   |  3 +-
 src/strategies/strategies-dct.c          | 10 +++-
 src/strategies/strategies-dct.h          |  4 +-
 src/strategies/strategies-picture.h      |  2 +-
 src/strategies/strategies-quant.h        | 20 +++++--
 src/transform.c                          | 68 ++++++++++++++----------
 src/transform.h                          |  5 +-
 tests/mts_tests.c                        |  6 +--
 20 files changed, 153 insertions(+), 96 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0ec99c7..ab0b63a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c")
 list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h")
 
 # Add also all the strategies
-file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c")
+file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c")
 
 # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC
 list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c")
diff --git a/src/cu.h b/src/cu.h
index 6fe960e7..f5eeb5e6 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -415,9 +415,9 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
  */
 static INLINE void copy_coeffs(const coeff_t *__restrict src,
                                coeff_t *__restrict dest,
-                               size_t width)
+                               size_t width, size_t height)
 {
-  memcpy(dest, src, width * width * sizeof(coeff_t));
+  memcpy(dest, src, width * height * sizeof(coeff_t));
 }
 
 
diff --git a/src/search.c b/src/search.c
index 4fbf33f3..56e07b06 100644
--- a/src/search.c
+++ b/src/search.c
@@ -89,20 +89,20 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr
   }
 }
 
-static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum
+static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum
                                   uvg_tree_type tree_type)
 {
   if (tree_type != UVG_CHROMA_T) {
-    const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local);
-    copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width);
+    const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y);
+    copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], cu_loc->width, cu_loc->height);
   }
 
   if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
-    const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T));
-    copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1);
-    copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1);
+    const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T));
+    copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
+    copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
     if (joint) {
-      copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1);
+      copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
     }
   }
 }
@@ -114,9 +114,11 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t
                               uvg_tree_type tree_type)
 {
   const int width = LCU_WIDTH >> depth;
+  cu_loc_t loc;
+  uvg_cu_loc_ctor(&loc, x_local, y_local, width, width);
   copy_cu_info  (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
   copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type);
-  copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
+  copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
   
 }
 
@@ -1093,7 +1095,7 @@ static double search_cu(
 
         }
         cu_loc_t loc;
-        const int width = LCU_WIDTH << depth;
+        const int width = LCU_WIDTH >> depth;
         const int height = width; // TODO: height for non-square blocks
         uvg_cu_loc_ctor(&loc, x, y, width, height);
         uvg_quantize_lcu_residual(state,
@@ -1579,7 +1581,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con
   copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type);
 
   // Copy coeffs to encoder state.
-  copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH);
+  copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH);
 
   if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) {
     cost = search_cu(
@@ -1596,9 +1598,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con
     copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T);
   }
 
-  copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C);
-  copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C);
+  copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C);
+  copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C);
   if (state->encoder_control->cfg.jccr) {
-    copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C);
+    copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C);
   }
 }
diff --git a/src/search_inter.c b/src/search_inter.c
index 7922f34b..ff511740 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -2225,6 +2225,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
       u_pred,
       u_resi,
       width,
+      height,
       LCU_WIDTH_C,
       width);
     uvg_generate_residual(
@@ -2232,6 +2233,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
       v_pred,
       v_resi,
       width,
+      height,
       LCU_WIDTH_C,
       width);
 
diff --git a/src/search_intra.c b/src/search_intra.c
index f3c8c838..06b86cc7 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -249,8 +249,11 @@ static void derive_mts_constraints(cu_info_t *const pred_cu,
 
 
 // ISP_TODO: move this function if it is used elsewhere
-bool can_use_isp(const int width, const int height, const int max_tr_size)
+static INLINE bool can_use_isp(const int width, const int height, const int max_tr_size)
 {
+  assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size.");
+  assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH.");
+
   const int log2_width = uvg_g_convert_to_bit[width] + 2;
   const int log2_height = uvg_g_convert_to_bit[height] + 2;
 
@@ -300,16 +303,14 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ
 
 
 // ISP_TODO: move this function if it is used elsewhere
-bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode)
+static INLINE bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode)
 {
   if (isp_mode == ISP_MODE_NO_ISP) {
     return false;
   }
   const int tu_width = isp_mode == ISP_MODE_HOR ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER);
   const int tu_height = isp_mode == ISP_MODE_HOR ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height;
-
-  // ISP_TODO: make a define for this or use existing
-  const int min_tb_size = 4;
+  const int min_tb_size = TR_MIN_WIDTH; 
 
   if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) {
     return false;
@@ -1449,7 +1450,7 @@ static int8_t search_intra_rdo(
   enum uvg_tree_type tree_type)
 {
   const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra);
-  const int width = LCU_WIDTH << depth;
+  const int width = LCU_WIDTH >> depth;
   const int height = width; // TODO: height for non-square blocks
   
   for (int mode = 0; mode < modes_to_check; mode++) {
@@ -1633,6 +1634,7 @@ int8_t uvg_search_intra_chroma_rdo(
             u_pred,
             u_resi,
             width,
+            height,
             LCU_WIDTH_C,
             width);
           uvg_generate_residual(
@@ -1640,6 +1642,7 @@ int8_t uvg_search_intra_chroma_rdo(
             v_pred,
             v_resi,
             width,
+            height,
             LCU_WIDTH_C,
             width);
           uvg_chorma_ts_out_t chorma_ts_out;
diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index b695273b..f3c812ed 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -1590,18 +1590,20 @@ static void mts_dct_avx2(
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
   const int8_t mts_idx)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
+  // ISP_TODO: height passed but not used
 
   uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
 
   if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx)
   {
-    dct_func* dct_func = uvg_get_dct_func(width, color, tu->type);
+    dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type);
     dct_func(bitdepth, input, output);
   }
   else
diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index 79e60def..fc19654a 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -61,6 +61,7 @@ static void uvg_angular_pred_avx2(
   uvg_pixel *const dst,
   const uint8_t multi_ref_idx)
 {
+  // ISP_TODO: non-square block implementation, height is passed but not used
   const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
   const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
   const int log2_width = uvg_g_convert_to_bit[width] + 2;
@@ -512,6 +513,7 @@ static void uvg_intra_pred_planar_avx2(
   const uint8_t *const ref_left,
   uint8_t *const dst)
 {
+  // ISP_TODO: non-square block implementation, height is passed but not used
   const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
   const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
   const int log2_width = uvg_g_convert_to_bit[width] + 2;
@@ -977,6 +979,7 @@ static void uvg_pdpc_planar_dc_avx2(
   const uvg_intra_ref *const used_ref,
   uvg_pixel *const dst)
 {
+  // ISP_TODO: non-square block implementation, height is passed but not used
   assert(mode == 0 || mode == 1);  // planar or DC
   const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
   const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index df90f149..a911928d 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -1743,8 +1743,8 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
   return diff;
 }
 
-static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
-
+static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) {
+  // ISP_TODO: non-square block implementation, height is passed but not used
   __m128i diff = _mm_setzero_si128();
   switch (width) {
   case 4:
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 5c39fe11..8313b1f0 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -626,7 +626,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
 * \returns  Whether coeff_out contains any non-zero coefficients.
 */
 int uvg_quantize_residual_avx2(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
   const uint8_t *const ref_in, const uint8_t *const pred_in,
@@ -637,15 +637,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
   // Temporary arrays to pass data to and from uvg_quant and transform functions.
   ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
   ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
-
-  const int height = width; // TODO: height for non-square blocks
+  // ISP_TODO: non-square block implementation, height is passed but not used
+  
   int has_coeffs = 0;
 
   assert(width <= TR_MAX_WIDTH);
   assert(width >= TR_MIN_WIDTH);
 
   // Get residual. (ref_in - pred_in -> residual)
-  uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
+  uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
 
   if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
     int y, x;
@@ -662,10 +662,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
 
   // Transform residual. (residual -> coeff)
   if (use_trskip) {
-    uvg_transformskip(state->encoder_control, residual, coeff, width);
+    uvg_transformskip(state->encoder_control, residual, coeff, width, height);
   }
   else {
-    uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+    uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
   }
 
   const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c
index cd05a01f..00562737 100644
--- a/src/strategies/generic/dct-generic.c
+++ b/src/strategies/generic/dct-generic.c
@@ -739,6 +739,11 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp
   partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \
 }
 
+static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output)
+{
+  // ISP_TODO: non-square transform here
+}
+
 DCT_NXN_GENERIC(4);
 DCT_NXN_GENERIC(8);
 DCT_NXN_GENERIC(16);
@@ -2487,26 +2492,28 @@ static void mts_dct_generic(
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
   const int8_t mts_idx)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
+  // ISP_TODO: height is not passed to uvg_get_tr_type; transform types are still chosen from width only
 
   uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
 
-  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx)
+  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height)
   {
-    dct_func *dct_func = uvg_get_dct_func(width, color, tu->type);
+    dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type);
     dct_func(bitdepth, input, output);
   }
   else
   {
-    const int height = width;
     int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0);
     int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0);
     const int log2_width_minus2 = uvg_g_convert_to_bit[width];
+    const int log2_height_minus2 = uvg_g_convert_to_bit[height];
     if(tu->lfnst_idx || tu->cr_lfnst_idx) {
       if ((width == 4 && height > 4) || (width > 4 && height == 4))
       {
@@ -2521,11 +2528,11 @@ static void mts_dct_generic(
     }
 
     partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2];
-    partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2];
+    partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus2];
 
     int16_t tmp[32 * 32];
     const int32_t shift_1st = log2_width_minus2 + bitdepth - 7;
-    const int32_t shift_2nd = log2_width_minus2 + 8;
+    const int32_t shift_2nd = log2_height_minus2 + 8;
 
     dct_hor(input, tmp, shift_1st, height, 0, skip_width);
     dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height);
@@ -2582,6 +2589,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
   success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
   success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
   success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
+  success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);
 
   success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);
 
diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c
index 817befed..6797a669 100644
--- a/src/strategies/generic/picture-generic.c
+++ b/src/strategies/generic/picture-generic.c
@@ -783,10 +783,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)
 
 
 static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, 
-  int width, int ref_stride, int pred_stride)
+  int width, int height, int ref_stride, int pred_stride)
 {
   int y, x;
-  for (y = 0; y < width; ++y) {
+  for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
     }
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index 96d2567a..03d4daf8 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -237,6 +237,7 @@ int uvg_quant_cbcr_residual_generic(
   encoder_state_t* const state, 
   const cu_info_t* const cur_cu,
   const int width,
+  const int height,
   const coeff_scan_order_t scan_order,
   const int in_stride, const int out_stride,
   const uvg_pixel* const u_ref_in, 
@@ -247,28 +248,28 @@ int uvg_quant_cbcr_residual_generic(
   uvg_pixel* v_rec_out,
   coeff_t* coeff_out,
   bool early_skip, 
-  int lmcs_chroma_adj, enum uvg_tree_type tree_type
-  ) {
+  int lmcs_chroma_adj, enum uvg_tree_type tree_type) 
+{
   ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
   ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
   ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
   ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
-
+  // ISP_TODO: this function is not fully converted to handle non-square blocks
   {
     int y, x;
-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
         v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
       }
     }
   }
-  uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
-  uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
+  uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride);
+  uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride);
   
   
   const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
-  for (int y = 0; y < width; y++)
+  for (int y = 0; y < height; y++)
   {
     for (int x = 0; x < width; x++)
     {
@@ -305,9 +306,9 @@ int uvg_quant_cbcr_residual_generic(
   }
 
 
-  uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
+  uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
   if(cur_cu->cr_lfnst_idx) {
-    uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
+    uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
   }
 
   if (state->encoder_control->cfg.rdoq_enable &&
@@ -441,7 +442,7 @@ int uvg_quant_cbcr_residual_generic(
 * \returns  Whether coeff_out contains any non-zero coefficients.
 */
 int uvg_quantize_residual_generic(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
   const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@@ -454,19 +455,17 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
 
   int has_coeffs = 0;
 
-  assert(width <= TR_MAX_WIDTH);
-  assert(width >= TR_MIN_WIDTH);
-
-  const int height = width; // TODO: height for non-square blocks
+  assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH);
+  assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH);
 
   // Get residual. (ref_in - pred_in -> residual)
-  uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
+  uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
 
   if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
     int y, x;
     int sign, absval;
     int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
-    for (y = 0; y < width; ++y) {
+    for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         sign = residual[x + y * width] >= 0 ? 1 : -1;
         absval = sign * residual[x + y * width];
@@ -477,10 +476,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
 
   // Transform residual. (residual -> coeff)
   if (use_trskip) {
-    uvg_transformskip(state->encoder_control, residual, coeff, width);
+    uvg_transformskip(state->encoder_control, residual, coeff, width, height);
   }
   else {
-    uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+    uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
   }
 
   const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h
index da2b05ae..ba1fa130 100644
--- a/src/strategies/generic/quant-generic.h
+++ b/src/strategies/generic/quant-generic.h
@@ -60,7 +60,7 @@ void uvg_quant_generic(
   uint8_t lfnst_idx);
 
 int uvg_quantize_residual_generic(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
   const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@@ -71,6 +71,7 @@ int uvg_quant_cbcr_residual_generic(
   encoder_state_t* const state,
   const cu_info_t* const cur_cu,
   const int width,
+  const int height,
   const coeff_scan_order_t scan_order,
   const int in_stride, const int out_stride,
   const uvg_pixel* const u_ref_in,
diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c
index 4ba2a37b..07f0fcb4 100644
--- a/src/strategies/strategies-dct.c
+++ b/src/strategies/strategies-dct.c
@@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0;
 dct_func * uvg_dct_8x8 = 0;
 dct_func * uvg_dct_16x16 = 0;
 dct_func * uvg_dct_32x32 = 0;
+dct_func * uvg_dct_non_square = 0;
 
 dct_func * uvg_fast_inverse_dst_4x4 = 0;
 
@@ -56,9 +57,11 @@ void(*uvg_mts_dct)(int8_t bitdepth,
   color_t color,
   const cu_info_t *tu,
   int8_t width,
+  int8_t height,
   const int16_t *input,
   int16_t *output,
   const int8_t mts_idx);
+
 void(*uvg_mts_idct)(int8_t bitdepth,
   color_t color,
   const cu_info_t *tu,
@@ -90,8 +93,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
  *
  * \returns Pointer to the function.
  */
-dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
+dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
 {
+  if (width != height) {
+    // Non-square block. Return generic dct for non-square blocks.
+    assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function.");
+    return uvg_dct_non_square;
+  }
   switch (width) {
   case 4:
     //if (color == COLOR_Y && type == CU_INTRA) {
diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h
index d58bf5a9..50cc3b5a 100644
--- a/src/strategies/strategies-dct.h
+++ b/src/strategies/strategies-dct.h
@@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4;
 extern dct_func * uvg_dct_8x8;
 extern dct_func * uvg_dct_16x16;
 extern dct_func * uvg_dct_32x32;
+extern dct_func * uvg_dct_non_square;
 
 extern dct_func * uvg_fast_inverse_dst_4x4;
 
@@ -64,6 +65,7 @@ typedef void (mts_dct_func)(
   color_t color,
   const cu_info_t* tu,
   int8_t width,
+  int8_t height,
   const int16_t* input,
   int16_t* output,
   const int8_t mts_idx);
@@ -82,7 +84,7 @@ typedef void (mts_idct_func)(
 extern mts_idct_func* uvg_mts_idct;
 
 int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth);
-dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type);
+dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type);
 dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type);
 
 
diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h
index 88f52cfc..8d73f74c 100644
--- a/src/strategies/strategies-picture.h
+++ b/src/strategies/strategies-picture.h
@@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
 
 typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);
 
-typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
+typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride);
 
 
 extern const uint32_t uvg_crc_table[256];
diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h
index a6c9a3d4..2920ed82 100644
--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@@ -45,12 +45,23 @@
 #include "tables.h"
 
 // Declare function pointers.
-typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
-  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx);
+typedef unsigned (quant_func)(
+  const encoder_state_t * const state, 
+  coeff_t *coef, 
+  coeff_t *q_coef, 
+  int32_t width,
+  int32_t height, 
+  color_t color, 
+  int8_t scan_idx, 
+  int8_t block_type, 
+  int8_t transform_skip, 
+  uint8_t lfnst_idx);
+
 typedef unsigned (quant_cbcr_func)(
   encoder_state_t* const state,
   const cu_info_t* const cur_cu,
   const int width,
+  const int height,
   const coeff_scan_order_t scan_order,
   const int in_stride, const int out_stride,
   const uvg_pixel* const u_ref_in,
@@ -63,15 +74,18 @@ typedef unsigned (quant_cbcr_func)(
   bool early_skip,
   int lmcs_chroma_adj, 
   enum uvg_tree_type tree_type);
+
 typedef unsigned (quant_residual_func)(encoder_state_t *const state,
-  const cu_info_t *const cur_cu, const int width, const color_t color,
+  const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
   const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
   uvg_pixel *rec_out, coeff_t *coeff_out,
   bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type);
+
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
   int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
+
 typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
 
 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);
diff --git a/src/transform.c b/src/transform.c
index abf793c2..0f73eeeb 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -77,6 +77,7 @@ const uint8_t uvg_g_chroma_scale[58]=
  * Parameters pred_in and rec_out may be aliased.
  *
  * \param width       Transform width.
+ * \param height      Transform height.
  * \param in_stride   Stride for ref_in and pred_in
  * \param out_stride  Stride for rec_out.
  * \param ref_in      Reference pixels.
@@ -87,6 +88,7 @@ const uint8_t uvg_g_chroma_scale[58]=
  * \returns  Whether coeff_out contains any non-zero coefficients.
  */
 static bool bypass_transquant(const int width,
+                              const int height,
                               const int in_stride,
                               const int out_stride,
                               const uvg_pixel *const ref_in,
@@ -96,7 +98,7 @@ static bool bypass_transquant(const int width,
 {
   bool nonzero_coeffs = false;
 
-  for (int y = 0; y < width; ++y) {
+  for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       int32_t in_idx    = x + y * in_stride;
       int32_t out_idx   = x + y * out_stride;
@@ -123,6 +125,7 @@ static bool bypass_transquant(const int width,
  * \param coeff   coefficients (residual) to filter
  */
 static void rdpcm(const int width,
+                  const int height,
                   const rdpcm_dir dir,
                   coeff_t *coeff)
 {
@@ -130,7 +133,7 @@ static void rdpcm(const int width,
   const int min_x  = (dir == RDPCM_HOR) ? 1 : 0;
   const int min_y  = (dir == RDPCM_HOR) ? 0 : 1;
 
-  for (int y = width - 1; y >= min_y; y--) {
+  for (int y = height - 1; y >= min_y; y--) {
     for (int x = width - 1; x >= min_x; x--) {
       const int index = x + y * width;
       coeff[index] -= coeff[index - offset];
@@ -203,17 +206,18 @@ void uvg_derive_lfnst_constraints(
 
 /**
  * \brief NxN inverse transform (2D)
- * \param coeff input data (transform coefficients)
- * \param block output data (residual)
- * \param block_size input data (width of transform)
+ * \param block   input data (residual)
+ * \param coeff   output data (coefficients)
+ * \param width   transform width
+ * \param height  transform height
  */
-void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size)
+void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height)
 {
-  int32_t  j,k;
-  for (j = 0; j < block_size; j++) {
-    for(k = 0; k < block_size; k ++) {
+  int32_t j, k;
+  for (j = 0; j < height; j++) {
+    for(k = 0; k < width; k ++) {
       // Casting back and forth to make UBSan not trigger due to left-shifting negatives
-      coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k]));
+      coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k]));
     }
   }
 }
@@ -243,17 +247,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,
 void uvg_transform2d(const encoder_control_t * const encoder,
                      int16_t *block,
                      int16_t *coeff,
-                     int8_t block_size,
+                     int8_t block_width,
+                     int8_t block_height,
                      color_t color,
                      const cu_info_t *tu)
 {
-  if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx)
+  if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height)
   {
-    uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts);
+    uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts);
   }
   else
   {
-    dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type);
+    dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type);
     dct_func(encoder->bitdepth, block, coeff);
   }
 }
@@ -373,6 +378,7 @@ static void generate_jccr_transforms(
       &temp_resi[(cbf_mask1 - 1) * trans_offset],
       &u_coeff[*num_transforms * trans_offset],
       width,
+      height,
       COLOR_U,
       pred_cu
     );
@@ -386,6 +392,7 @@ static void generate_jccr_transforms(
       &temp_resi[(cbf_mask2 - 1) * trans_offset],
       &u_coeff[*num_transforms * trans_offset],
       width,
+      height,
       COLOR_U,
       pred_cu
     );
@@ -492,10 +499,10 @@ void uvg_chroma_transform_search(
   ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2];
   ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
   uvg_transform2d(
-    state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu
+    state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu
   );
   uvg_transform2d(
-    state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu
+    state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu
   );
   enum uvg_chroma_transforms transforms[5];
   transforms[0] = DCT7_CHROMA;
@@ -508,8 +515,8 @@ void uvg_chroma_transform_search(
     pred_cu->cr_lfnst_idx == 0 ;
 
   if (can_use_tr_skip) {
-    uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width);
-    uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width);
+    uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height);
+    uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height);
     transforms[num_transforms] = CHROMA_TS;
     num_transforms++;
   }
@@ -1053,7 +1060,7 @@ void uvg_inv_lfnst(
  */
 int uvg_quantize_residual_trskip(
     encoder_state_t *const state,
-    const cu_info_t *const cur_cu, const int width, const color_t color,
+    const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
     const coeff_scan_order_t scan_order, int8_t *trskip_out, 
     const int in_stride, const int out_stride,
     const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, 
@@ -1074,7 +1081,7 @@ int uvg_quantize_residual_trskip(
   //noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost;
 
   skip.has_coeffs = uvg_quantize_residual(
-    state, cur_cu, width, color, scan_order,
+    state, cur_cu, width, height, color, scan_order,
     1, in_stride, width,
     ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj, 
     UVG_BOTH_T /* tree type doesn't matter for transformskip*/);
@@ -1090,9 +1097,9 @@ int uvg_quantize_residual_trskip(
   if (best->has_coeffs || rec_out != pred_in) {
     // If there is no residual and reconstruction is already in rec_out, 
     // we can skip this.
-    uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride);
+    uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride);
   }
-  copy_coeffs(best->coeff, coeff_out, width);
+  copy_coeffs(best->coeff, coeff_out, width, height);
 
   return best->has_coeffs;
 }
@@ -1131,8 +1138,8 @@ static void quantize_tr_residual(
   // This should ensure that the CBF data doesn't get corrupted if this function
   // is called more than once.
 
-  int32_t tr_width  = color == COLOR_Y ? cu_loc->width  : cu_loc->chroma_width;
-  int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int32_t tr_width  = color == COLOR_Y ? cu_loc->width  : cu_loc->chroma_width;
+  const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
   
   const int32_t lcu_width = LCU_WIDTH >> shift;
   const int8_t mode =
@@ -1183,7 +1190,9 @@ static void quantize_tr_residual(
   }
 
   if (cfg->lossless) {
+    // ISP_TODO: is there any sensible case where in and out strides would be different?
     has_coeffs = bypass_transquant(tr_width,
+                                   tr_height,
                                    lcu_width, // in stride
                                    lcu_width, // out stride
                                    ref,
@@ -1193,9 +1202,9 @@ static void quantize_tr_residual(
     if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) {
       // implicit rdpcm for horizontal and vertical intra modes
       if (mode == 18) {
-        rdpcm(tr_width, RDPCM_HOR, coeff);
+        rdpcm(tr_width, tr_height, RDPCM_HOR, coeff);
       } else if (mode == 50) {
-        rdpcm(tr_width, RDPCM_VER, coeff);
+        rdpcm(tr_width, tr_height, RDPCM_VER, coeff);
       }
     }
 
@@ -1206,6 +1215,7 @@ static void quantize_tr_residual(
     has_coeffs = uvg_quantize_residual_trskip(state,
                                               cur_pu,
                                               tr_width,
+                                              tr_height,
                                               color,
                                               scan_idx,
                                               &tr_skip,
@@ -1222,6 +1232,7 @@ static void quantize_tr_residual(
         state,
         cur_pu,
         tr_width,
+        tr_height,
         scan_idx,
         lcu_width,
         lcu_width,
@@ -1240,6 +1251,7 @@ static void quantize_tr_residual(
     has_coeffs = uvg_quantize_residual(state,
                                        cur_pu,
                                        tr_width,
+                                       tr_height,
                                        color,
                                        scan_idx,
                                        false, // tr skip
@@ -1326,8 +1338,8 @@ void uvg_quantize_lcu_residual(
     const int offset = width / 2;
     for (int j = 0; j < 2; ++j) {
       for (int i = 0; i < 2; ++i) {
-        const cu_loc_t loc;
-        uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width, height);
+        cu_loc_t loc;
+        uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1);
         // jccr is currently not supported if transform is split
         uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, early_skip, tree_type);
       }
diff --git a/src/transform.h b/src/transform.h
index 61c50c04..a7b6e221 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -47,13 +47,14 @@ extern const uint8_t uvg_g_chroma_scale[58];
 extern const int16_t uvg_g_inv_quant_scales[6];
 extern const int16_t uvg_g_quant_scales[6];
 
-void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
+void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);
 void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 
 void uvg_transform2d(const encoder_control_t * const encoder,
                      int16_t *block,
                      int16_t *coeff,
-                     int8_t block_size,
+                     int8_t block_width,
+                     int8_t block_height,
                      color_t color,
                      const cu_info_t *tu);
 
diff --git a/tests/mts_tests.c b/tests/mts_tests.c
index f607b77d..2a132c77 100644
--- a/tests/mts_tests.c
+++ b/tests/mts_tests.c
@@ -111,7 +111,7 @@ static void setup_tests()
           tu.tr_idx = MTS_DST7_DST7 + trafo;
           tu.lfnst_idx = 0;
           tu.cr_lfnst_idx = 0;
-          mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
+          mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
         }
       }      
     }
@@ -167,7 +167,7 @@ TEST dct(void)
       int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
       ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
 
-      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
+      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
 
       for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
         ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]);
@@ -192,7 +192,7 @@ TEST idct(void)
       int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
       ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
 
-      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
+      test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
 
       for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
         ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]);