[isp] Modify transform and quantization functions to handle non-square blocks. Add strategy headers to CMakeLists.txt.

This commit is contained in:
siivonek 2022-08-03 13:23:27 +03:00 committed by Marko Viitanen
parent 06532dce02
commit 626c9b02ea
20 changed files with 153 additions and 96 deletions

View file

@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c")
list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h")
# Add also all the strategies
file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c")
file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c")
# ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC
list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c")

View file

@ -415,9 +415,9 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
*/
static INLINE void copy_coeffs(const coeff_t *__restrict src,
coeff_t *__restrict dest,
size_t width)
size_t width, size_t height)
{
memcpy(dest, src, width * width * sizeof(coeff_t));
memcpy(dest, src, width * height * sizeof(coeff_t));
}

View file

@ -89,20 +89,20 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr
}
}
static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum
static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum
uvg_tree_type tree_type)
{
if (tree_type != UVG_CHROMA_T) {
const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local);
copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width);
const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y);
copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], cu_loc->width, cu_loc->height);
}
if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T));
copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1);
copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1);
const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T));
copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
if (joint) {
copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1);
copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height);
}
}
}
@ -114,9 +114,11 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t
uvg_tree_type tree_type)
{
const int width = LCU_WIDTH >> depth;
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_local, y_local, width, width);
copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type);
copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
}
@ -1093,7 +1095,7 @@ static double search_cu(
}
cu_loc_t loc;
const int width = LCU_WIDTH << depth;
const int width = LCU_WIDTH >> depth;
const int height = width; // TODO: height for non-square blocks
uvg_cu_loc_ctor(&loc, x, y, width, height);
uvg_quantize_lcu_residual(state,
@ -1579,7 +1581,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con
copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type);
// Copy coeffs to encoder state.
copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH);
copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH);
if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) {
cost = search_cu(
@ -1596,9 +1598,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con
copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T);
}
copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C);
copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C);
copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C);
copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C);
if (state->encoder_control->cfg.jccr) {
copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C);
copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C);
}
}

View file

@ -2225,6 +2225,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
u_pred,
u_resi,
width,
height,
LCU_WIDTH_C,
width);
uvg_generate_residual(
@ -2232,6 +2233,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
v_pred,
v_resi,
width,
height,
LCU_WIDTH_C,
width);

View file

@ -249,8 +249,11 @@ static void derive_mts_constraints(cu_info_t *const pred_cu,
// ISP_TODO: move this function if it is used elsewhere
bool can_use_isp(const int width, const int height, const int max_tr_size)
static INLINE bool can_use_isp(const int width, const int height, const int max_tr_size)
{
assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size.");
assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH.");
const int log2_width = uvg_g_convert_to_bit[width] + 2;
const int log2_height = uvg_g_convert_to_bit[height] + 2;
@ -300,16 +303,14 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ
// ISP_TODO: move this function if it is used elsewhere
bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode)
static INLINE bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode)
{
if (isp_mode == ISP_MODE_NO_ISP) {
return false;
}
const int tu_width = isp_mode == ISP_MODE_HOR ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER);
const int tu_height = isp_mode == ISP_MODE_HOR ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height;
// ISP_TODO: make a define for this or use existing
const int min_tb_size = 4;
const int min_tb_size = TR_MIN_WIDTH;
if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) {
return false;
@ -1449,7 +1450,7 @@ static int8_t search_intra_rdo(
enum uvg_tree_type tree_type)
{
const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra);
const int width = LCU_WIDTH << depth;
const int width = LCU_WIDTH >> depth;
const int height = width; // TODO: height for non-square blocks
for (int mode = 0; mode < modes_to_check; mode++) {
@ -1633,6 +1634,7 @@ int8_t uvg_search_intra_chroma_rdo(
u_pred,
u_resi,
width,
height,
LCU_WIDTH_C,
width);
uvg_generate_residual(
@ -1640,6 +1642,7 @@ int8_t uvg_search_intra_chroma_rdo(
v_pred,
v_resi,
width,
height,
LCU_WIDTH_C,
width);
uvg_chorma_ts_out_t chorma_ts_out;

View file

@ -1590,18 +1590,20 @@ static void mts_dct_avx2(
const color_t color,
const cu_info_t* tu,
const int8_t width,
const int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx)
{
tr_type_t type_hor;
tr_type_t type_ver;
// ISP_TODO: height passed but not used
uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx)
{
dct_func* dct_func = uvg_get_dct_func(width, color, tu->type);
dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type);
dct_func(bitdepth, input, output);
}
else

View file

@ -61,6 +61,7 @@ static void uvg_angular_pred_avx2(
uvg_pixel *const dst,
const uint8_t multi_ref_idx)
{
// ISP_TODO: non-square block implementation, height is passed but not used
const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_bit[width] + 2;
@ -512,6 +513,7 @@ static void uvg_intra_pred_planar_avx2(
const uint8_t *const ref_left,
uint8_t *const dst)
{
// ISP_TODO: non-square block implementation, height is passed but not used
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_bit[width] + 2;
@ -977,6 +979,7 @@ static void uvg_pdpc_planar_dc_avx2(
const uvg_intra_ref *const used_ref,
uvg_pixel *const dst)
{
// ISP_TODO: non-square block implementation, height is passed but not used
assert(mode == 0 || mode == 1); // planar or DC
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;

View file

@ -1743,8 +1743,8 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
return diff;
}
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) {
// ISP_TODO: non-square block implementation, height is passed but not used
__m128i diff = _mm_setzero_si128();
switch (width) {
case 4:

View file

@ -626,7 +626,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
* \returns Whether coeff_out contains any non-zero coefficients.
*/
int uvg_quantize_residual_avx2(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uint8_t *const ref_in, const uint8_t *const pred_in,
@ -637,15 +637,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
// Temporary arrays to pass data to and from uvg_quant and transform functions.
ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
const int height = width; // TODO: height for non-square blocks
// ISP_TODO: non-square block implementation, height is passed but not used
int has_coeffs = 0;
assert(width <= TR_MAX_WIDTH);
assert(width >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
@ -662,10 +662,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
// Transform residual. (residual -> coeff)
if (use_trskip) {
uvg_transformskip(state->encoder_control, residual, coeff, width);
uvg_transformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;

View file

@ -739,6 +739,11 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp
partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \
}
static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output)
{
// ISP_TODO: non-square transform here
}
DCT_NXN_GENERIC(4);
DCT_NXN_GENERIC(8);
DCT_NXN_GENERIC(16);
@ -2487,26 +2492,28 @@ static void mts_dct_generic(
const color_t color,
const cu_info_t* tu,
const int8_t width,
const int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx)
{
tr_type_t type_hor;
tr_type_t type_ver;
// ISP_TODO: height passed but not used
uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx)
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height)
{
dct_func *dct_func = uvg_get_dct_func(width, color, tu->type);
dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type);
dct_func(bitdepth, input, output);
}
else
{
const int height = width;
int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0);
int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0);
const int log2_width_minus2 = uvg_g_convert_to_bit[width];
const int log2_height_minus2 = uvg_g_convert_to_bit[height];
if(tu->lfnst_idx || tu->cr_lfnst_idx) {
if ((width == 4 && height > 4) || (width > 4 && height == 4))
{
@ -2521,11 +2528,11 @@ static void mts_dct_generic(
}
partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2];
partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2];
partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus2];
int16_t tmp[32 * 32];
const int32_t shift_1st = log2_width_minus2 + bitdepth - 7;
const int32_t shift_2nd = log2_width_minus2 + 8;
const int32_t shift_2nd = log2_height_minus2 + 8;
dct_hor(input, tmp, shift_1st, height, 0, skip_width);
dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height);
@ -2582,6 +2589,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);
success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);

View file

@ -783,10 +783,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)
static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual,
int width, int ref_stride, int pred_stride)
int width, int height, int ref_stride, int pred_stride)
{
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
}

View file

@ -237,6 +237,7 @@ int uvg_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,
@ -247,28 +248,28 @@ int uvg_quant_cbcr_residual_generic(
uvg_pixel* v_rec_out,
coeff_t* coeff_out,
bool early_skip,
int lmcs_chroma_adj, enum uvg_tree_type tree_type
) {
int lmcs_chroma_adj, enum uvg_tree_type tree_type)
{
ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
// ISP_TODO: this function is not fully converted to handle non-square blocks
{
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
}
}
}
uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride);
uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride);
const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
for (int y = 0; y < width; y++)
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
@ -305,9 +306,9 @@ int uvg_quant_cbcr_residual_generic(
}
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
if(cur_cu->cr_lfnst_idx) {
uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
}
if (state->encoder_control->cfg.rdoq_enable &&
@ -441,7 +442,7 @@ int uvg_quant_cbcr_residual_generic(
* \returns Whether coeff_out contains any non-zero coefficients.
*/
int uvg_quantize_residual_generic(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -454,19 +455,17 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
int has_coeffs = 0;
assert(width <= TR_MAX_WIDTH);
assert(width >= TR_MIN_WIDTH);
const int height = width; // TODO: height for non-square blocks
assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH);
assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
int sign, absval;
int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
sign = residual[x + y * width] >= 0 ? 1 : -1;
absval = sign * residual[x + y * width];
@ -477,10 +476,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
// Transform residual. (residual -> coeff)
if (use_trskip) {
uvg_transformskip(state->encoder_control, residual, coeff, width);
uvg_transformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;

View file

@ -60,7 +60,7 @@ void uvg_quant_generic(
uint8_t lfnst_idx);
int uvg_quantize_residual_generic(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -71,6 +71,7 @@ int uvg_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,

View file

@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0;
dct_func * uvg_dct_8x8 = 0;
dct_func * uvg_dct_16x16 = 0;
dct_func * uvg_dct_32x32 = 0;
dct_func * uvg_dct_non_square = 0;
dct_func * uvg_fast_inverse_dst_4x4 = 0;
@ -56,9 +57,11 @@ void(*uvg_mts_dct)(int8_t bitdepth,
color_t color,
const cu_info_t *tu,
int8_t width,
int8_t height,
const int16_t *input,
int16_t *output,
const int8_t mts_idx);
void(*uvg_mts_idct)(int8_t bitdepth,
color_t color,
const cu_info_t *tu,
@ -90,8 +93,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
*
* \returns Pointer to the function.
*/
dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
{
if (width != height) {
// Non-square block. Return generic dct for non-square blocks.
assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function.");
return uvg_dct_non_square;
}
switch (width) {
case 4:
//if (color == COLOR_Y && type == CU_INTRA) {

View file

@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4;
extern dct_func * uvg_dct_8x8;
extern dct_func * uvg_dct_16x16;
extern dct_func * uvg_dct_32x32;
extern dct_func * uvg_dct_non_square;
extern dct_func * uvg_fast_inverse_dst_4x4;
@ -64,6 +65,7 @@ typedef void (mts_dct_func)(
color_t color,
const cu_info_t* tu,
int8_t width,
int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx);
@ -82,7 +84,7 @@ typedef void (mts_idct_func)(
extern mts_idct_func* uvg_mts_idct;
int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth);
dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type);
dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type);
dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type);

View file

@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);
typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride);
extern const uint32_t uvg_crc_table[256];

View file

@ -45,12 +45,23 @@
#include "tables.h"
// Declare function pointers.
typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx);
typedef unsigned (quant_func)(
const encoder_state_t * const state,
coeff_t *coef,
coeff_t *q_coef,
int32_t width,
int32_t height,
color_t color,
int8_t scan_idx,
int8_t block_type,
int8_t transform_skip,
uint8_t lfnst_idx);
typedef unsigned (quant_cbcr_func)(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,
@ -63,15 +74,18 @@ typedef unsigned (quant_cbcr_func)(
bool early_skip,
int lmcs_chroma_adj,
enum uvg_tree_type tree_type);
typedef unsigned (quant_residual_func)(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
uvg_pixel *rec_out, coeff_t *coeff_out,
bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type);
typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

View file

@ -77,6 +77,7 @@ const uint8_t uvg_g_chroma_scale[58]=
* Parameters pred_in and rec_out may be aliased.
*
* \param width Transform width.
* \param height Transform height.
* \param in_stride Stride for ref_in and pred_in
* \param out_stride Stride for rec_out.
* \param ref_in Reference pixels.
@ -87,6 +88,7 @@ const uint8_t uvg_g_chroma_scale[58]=
* \returns Whether coeff_out contains any non-zero coefficients.
*/
static bool bypass_transquant(const int width,
const int height,
const int in_stride,
const int out_stride,
const uvg_pixel *const ref_in,
@ -96,7 +98,7 @@ static bool bypass_transquant(const int width,
{
bool nonzero_coeffs = false;
for (int y = 0; y < width; ++y) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int32_t in_idx = x + y * in_stride;
int32_t out_idx = x + y * out_stride;
@ -123,6 +125,7 @@ static bool bypass_transquant(const int width,
* \param coeff coefficients (residual) to filter
*/
static void rdpcm(const int width,
const int height,
const rdpcm_dir dir,
coeff_t *coeff)
{
@ -130,7 +133,7 @@ static void rdpcm(const int width,
const int min_x = (dir == RDPCM_HOR) ? 1 : 0;
const int min_y = (dir == RDPCM_HOR) ? 0 : 1;
for (int y = width - 1; y >= min_y; y--) {
for (int y = height - 1; y >= min_y; y--) {
for (int x = width - 1; x >= min_x; x--) {
const int index = x + y * width;
coeff[index] -= coeff[index - offset];
@ -203,17 +206,18 @@ void uvg_derive_lfnst_constraints(
/**
* \brief NxN inverse transform (2D)
* \param coeff input data (transform coefficients)
* \param block output data (residual)
* \param block_size input data (width of transform)
* \param coeff input data (transform coefficients)
* \param block output data (residual)
* \param width transform width
* \param height transform height
*/
void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size)
void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height)
{
int32_t j,k;
for (j = 0; j < block_size; j++) {
for(k = 0; k < block_size; k ++) {
int32_t j, k;
for (j = 0; j < height; j++) {
for(k = 0; k < width; k ++) {
// Casting back and forth to make UBSan not trigger due to left-shifting negatives
coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k]));
coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k]));
}
}
}
@ -243,17 +247,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,
void uvg_transform2d(const encoder_control_t * const encoder,
int16_t *block,
int16_t *coeff,
int8_t block_size,
int8_t block_width,
int8_t block_height,
color_t color,
const cu_info_t *tu)
{
if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx)
if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height)
{
uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts);
uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts);
}
else
{
dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type);
dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type);
dct_func(encoder->bitdepth, block, coeff);
}
}
@ -373,6 +378,7 @@ static void generate_jccr_transforms(
&temp_resi[(cbf_mask1 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
height,
COLOR_U,
pred_cu
);
@ -386,6 +392,7 @@ static void generate_jccr_transforms(
&temp_resi[(cbf_mask2 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
height,
COLOR_U,
pred_cu
);
@ -492,10 +499,10 @@ void uvg_chroma_transform_search(
ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2];
ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
uvg_transform2d(
state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu
state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu
);
uvg_transform2d(
state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu
state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu
);
enum uvg_chroma_transforms transforms[5];
transforms[0] = DCT7_CHROMA;
@ -508,8 +515,8 @@ void uvg_chroma_transform_search(
pred_cu->cr_lfnst_idx == 0 ;
if (can_use_tr_skip) {
uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width);
uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width);
uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height);
uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height);
transforms[num_transforms] = CHROMA_TS;
num_transforms++;
}
@ -1053,7 +1060,7 @@ void uvg_inv_lfnst(
*/
int uvg_quantize_residual_trskip(
encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, int8_t *trskip_out,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -1074,7 +1081,7 @@ int uvg_quantize_residual_trskip(
//noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost;
skip.has_coeffs = uvg_quantize_residual(
state, cur_cu, width, color, scan_order,
state, cur_cu, width, height, color, scan_order,
1, in_stride, width,
ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj,
UVG_BOTH_T /* tree type doesn't matter for transformskip*/);
@ -1090,9 +1097,9 @@ int uvg_quantize_residual_trskip(
if (best->has_coeffs || rec_out != pred_in) {
// If there is no residual and reconstruction is already in rec_out,
// we can skip this.
uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride);
uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride);
}
copy_coeffs(best->coeff, coeff_out, width);
copy_coeffs(best->coeff, coeff_out, width, height);
return best->has_coeffs;
}
@ -1131,8 +1138,8 @@ static void quantize_tr_residual(
// This should ensure that the CBF data doesn't get corrupted if this function
// is called more than once.
int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int32_t lcu_width = LCU_WIDTH >> shift;
const int8_t mode =
@ -1183,7 +1190,9 @@ static void quantize_tr_residual(
}
if (cfg->lossless) {
// ISP_TODO: is there any sensible case where in and out strides would be different?
has_coeffs = bypass_transquant(tr_width,
tr_height,
lcu_width, // in stride
lcu_width, // out stride
ref,
@ -1193,9 +1202,9 @@ static void quantize_tr_residual(
if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) {
// implicit rdpcm for horizontal and vertical intra modes
if (mode == 18) {
rdpcm(tr_width, RDPCM_HOR, coeff);
rdpcm(tr_width, tr_height, RDPCM_HOR, coeff);
} else if (mode == 50) {
rdpcm(tr_width, RDPCM_VER, coeff);
rdpcm(tr_width, tr_height, RDPCM_VER, coeff);
}
}
@ -1206,6 +1215,7 @@ static void quantize_tr_residual(
has_coeffs = uvg_quantize_residual_trskip(state,
cur_pu,
tr_width,
tr_height,
color,
scan_idx,
&tr_skip,
@ -1222,6 +1232,7 @@ static void quantize_tr_residual(
state,
cur_pu,
tr_width,
tr_height,
scan_idx,
lcu_width,
lcu_width,
@ -1240,6 +1251,7 @@ static void quantize_tr_residual(
has_coeffs = uvg_quantize_residual(state,
cur_pu,
tr_width,
tr_height,
color,
scan_idx,
false, // tr skip
@ -1326,8 +1338,8 @@ void uvg_quantize_lcu_residual(
const int offset = width / 2;
for (int j = 0; j < 2; ++j) {
for (int i = 0; i < 2; ++i) {
const cu_loc_t loc;
uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width, height);
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1);
// jccr is currently not supported if transform is split
uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, early_skip, tree_type);
}

View file

@ -47,13 +47,14 @@ extern const uint8_t uvg_g_chroma_scale[58];
extern const int16_t uvg_g_inv_quant_scales[6];
extern const int16_t uvg_g_quant_scales[6];
void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);
void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
void uvg_transform2d(const encoder_control_t * const encoder,
int16_t *block,
int16_t *coeff,
int8_t block_size,
int8_t block_width,
int8_t block_height,
color_t color,
const cu_info_t *tu);

View file

@ -111,7 +111,7 @@ static void setup_tests()
tu.tr_idx = MTS_DST7_DST7 + trafo;
tu.lfnst_idx = 0;
tu.cr_lfnst_idx = 0;
mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
}
}
}
@ -167,7 +167,7 @@ TEST dct(void)
int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]);
@ -192,7 +192,7 @@ TEST idct(void)
int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]);