From ae0336fdfc31511680af371960ae7841f68ffaba Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Thu, 18 Aug 2022 15:07:22 +0300
Subject: [PATCH] [isp] Add non-square block handling to functions.

---
 src/context.c                                 | 10 +++----
 src/context.h                                 |  2 +-
 src/encode_coding_tree.c                      | 28 +++++++++++--------
 src/encode_coding_tree.h                      |  1 +
 src/rdo.c                                     | 12 +++++---
 src/rdo.h                                     |  1 +
 src/search.c                                  | 26 +++++++++--------
 src/strategies/avx2/encode_coding_tree-avx2.h |  1 +
 src/strategies/avx2/quant-avx2.c              |  5 ++--
 .../generic/encode_coding_tree-generic.c      | 10 +++----
 .../generic/encode_coding_tree-generic.h      |  1 +
 src/strategies/generic/quant-generic.c        |  5 ++--
 src/strategies/strategies-encode.h            |  1 +
 src/strategies/strategies-quant.h             |  2 +-
 src/transform.c                               |  2 ++
 15 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/src/context.c b/src/context.c
index 31124b02..708b9da4 100644
--- a/src/context.c
+++ b/src/context.c
@@ -657,7 +657,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag,
 * \returns context index for current scan position
 */
 uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                                         uint32_t width, uint32_t height, int8_t type,
+                                         uint32_t width, uint32_t height, int8_t color,
                                          int32_t* temp_diag, int32_t* temp_sum)
 {
   const coeff_t* data = coeff + pos_x + pos_y * width;
@@ -687,7 +687,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u
   }
 #undef UPDATE
   int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0);
-  if (type == 0 /* Luma */)
+  if (color == COLOR_Y)
   {
     ctx_ofs += diag < 5 ? 4 : 0;
   }
@@ -815,7 +815,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos
 * \returns context go rice parameter
 */
 uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel)
+                             uint32_t width, uint32_t height, uint32_t baselevel)
 {
 #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/
 
@@ -857,8 +857,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
 * \returns context go rice parameter
 */
 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-  uint32_t height, uint32_t width, uint32_t baselevel)
+  uint32_t width, uint32_t height, uint32_t baselevel)
 {
-  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel);
+  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel);
   return  g_go_rice_pars[check];  
 }
\ No newline at end of file
diff --git a/src/context.h b/src/context.h
index 3f342409..f083e44c 100644
--- a/src/context.h
+++ b/src/context.h
@@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                     uint32_t height, uint32_t width, uint32_t baselevel);
+                     uint32_t width, uint32_t height, uint32_t baselevel);
 
 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel);
+                             uint32_t width, uint32_t height, uint32_t baselevel);
 
 #define CNU 35
 #define DWS 8
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 019c1d03..f917b31d 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -213,6 +213,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
   cabac_data_t* const cabac,
   const coeff_t* coeff,
   uint32_t width,
+  uint32_t height,
   uint8_t type,
   int8_t scan_mode,
   double* bits_out) 
@@ -227,8 +228,9 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
 
   // CONSTANTS
 
-  const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
-  const uint32_t log2_block_height = log2_block_width; // TODO: height
+  const uint32_t log2_block_width  = uvg_g_convert_to_log2[width];
+  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  // TODO: log2_cg_size is wrong if width != height
   const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1];
   const uint32_t* old_scan =    uvg_g_sig_last_scan[scan_mode][log2_block_width - 1];
   const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode];
@@ -243,13 +245,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
 
   cabac->cur_ctx = base_coeff_group_ctx;
   
-  // ISP_TODO: height
-  int maxCtxBins = (width * width * 7) >> 2;
+  int maxCtxBins = (width * height * 7) >> 2;
   unsigned scan_cg_last = (unsigned )-1;
   //unsigned scan_pos_last = (unsigned )-1;
 
-  // ISP_TODO: height
-  for (i = 0; i < width * width; i++) {
+  for (i = 0; i < width * height; i++) {
     if (coeff[scan[i]]) {
       // ISP_DEBUG
       assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one.");
@@ -258,7 +258,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
       sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1;
     }
   }
-  scan_cg_last = (width * width - 1) >> log2_cg_size;
+  // TODO: this won't work with non-square blocks
+  scan_cg_last = (width * height - 1) >> log2_cg_size;
   const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2));
 
   bool no_sig_group_before_last = true;
@@ -481,6 +482,7 @@ static void encode_chroma_tu(
   enum
   uvg_tree_type tree_type)
 {
+  int height_c = width_c; // TODO: height for non-square blocks
   int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C;
   int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C;
   cabac_data_t* const cabac = &state->cabac;
@@ -496,7 +498,7 @@ static void encode_chroma_tu(
         // TODO: transform skip for chroma blocks
         CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag");
       }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, cur_pu, NULL);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, height_c, COLOR_U, *scan_idx, cur_pu, NULL);
     }
 
     if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
@@ -504,7 +506,7 @@ static void encode_chroma_tu(
         cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
         CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag");
       }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, cur_pu, NULL);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL);
     }
   }
   else {
@@ -513,7 +515,7 @@ static void encode_chroma_tu(
       cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
       CABAC_BIN(cabac, 0, "transform_skip_flag");
     }
-    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, cur_pu, NULL);
+    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL);
     
   }
 }
@@ -534,6 +536,9 @@ static void encode_transform_unit(
   cabac_data_t* const cabac = &state->cabac;
   const uint8_t width = LCU_WIDTH >> depth;
   const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
+  // TODO: height for non-square blocks
+  const uint8_t height = width;
+  const uint8_t height_c = width_c;
 
   cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array;
   const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y);
@@ -556,13 +561,14 @@ static void encode_transform_unit(
       DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0);
     }
     if(cur_pu->tr_idx == MTS_SKIP) {
-      uvg_encode_ts_residual(state, cabac, coeff_y, width, 0, scan_idx, NULL);      
+      uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL);      
     }
     else {
       uvg_encode_coeff_nxn(state,
                            cabac,
                            coeff_y,
                            width,
+                           height,
                            0,
                            scan_idx,
                            (cu_info_t * )cur_pu,
diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h
index c2cd39da..9757a327 100644
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@@ -64,6 +64,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
   cabac_data_t* const cabac,
   const coeff_t* coeff,
   uint32_t width,
+  uint32_t height,
   uint8_t type,
   int8_t scan_mode,
   double* bits);
diff --git a/src/rdo.c b/src/rdo.c
index fc4052c4..9f5abd21 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -298,6 +298,7 @@ static INLINE double get_coeff_cabac_cost(
   const encoder_state_t * const state,
   const coeff_t *coeff,
   int32_t width,
+  int32_t height,
   color_t color,
   int8_t scan_mode,
   int8_t tr_skip,
@@ -305,7 +306,7 @@ static INLINE double get_coeff_cabac_cost(
 {
   // Make sure there are coeffs present
   bool found = false;
-  for (int i = 0; i < width*width; i++) {
+  for (int i = 0; i < width * height; i++) {
     if (coeff[i] != 0) {
       found = 1;
       break;
@@ -331,6 +332,7 @@ static INLINE double get_coeff_cabac_cost(
                          &cabac_copy,
                          coeff,
                          width,
+                         height,
                          color,
                          scan_mode,
                          cur_tu,                   
@@ -341,6 +343,7 @@ static INLINE double get_coeff_cabac_cost(
       &cabac_copy,
       coeff,
       width,
+      height,
       color,
       scan_mode,
       &bits);
@@ -392,6 +395,7 @@ double uvg_get_coeff_cost(
   const coeff_t *coeff,
   cu_info_t* cur_tu,
   int32_t width,
+  int32_t height,
   color_t color,
   int8_t scan_mode,
   int8_t tr_skip)
@@ -409,15 +413,15 @@ double uvg_get_coeff_cost(
       return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
     } else {
       uint64_t weights = uvg_fast_coeff_get_weights(state);
-      uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
+      uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, height, weights);
       if (check_accuracy) {
-        double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+        double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu);
         save_accuracy(state->qp, ccc, fast_cost);
       }
       return fast_cost;
     }
   } else {
-    double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+    double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu);
     if (save_cccs) {
-      save_ccc(state->qp, coeff, width * width, ccc);
+      save_ccc(state->qp, coeff, width * height, ccc);
     }
diff --git a/src/rdo.h b/src/rdo.h
index 7f325cfd..88a6548b 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -74,6 +74,7 @@ double uvg_get_coeff_cost(
   const coeff_t *coeff,
   cu_info_t* cur_tu,
   int32_t width,
+  int32_t height,
   color_t color,
   int8_t scan_mode,
   int8_t tr_skip);
diff --git a/src/search.c b/src/search.c
index 64dd263b..ba2f79c9 100644
--- a/src/search.c
+++ b/src/search.c
@@ -310,7 +310,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
                            lcu_t *const lcu,
                            uint8_t isp_cbf)
 {
-  const int width = LCU_WIDTH >> depth;
+  const int width  = LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
   const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
   cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
 
@@ -380,7 +381,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
     int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
     const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
-    coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP);
+    coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP);
   }
 
   double bits = tr_tree_bits + coeff_bits;
@@ -394,7 +395,8 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
                              lcu_t *const lcu)
 {
   const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
-  const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+  const int width  = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
   const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
 
@@ -468,11 +470,11 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
     const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
 
     if((pred_cu->joint_cb_cr & 3) == 0){
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, 2, scan_order, 0);
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, height, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, height, 2, scan_order, 0);
     }
     else {
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, height, 2, scan_order, 0);
       
     }
   }
@@ -493,6 +495,7 @@ static double cu_rd_cost_tr_split_accurate(
   enum uvg_tree_type tree_type,
   uint8_t isp_cbf) {
   const int width = LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
 
   const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
   // cur_cu is used for TU parameters.
@@ -597,7 +600,7 @@ static double cu_rd_cost_tr_split_accurate(
     int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
     const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
-    coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1);
+    coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, tr_cu->tr_skip & 1);
   }
 
   if(depth == 4 || tree_type == UVG_LUMA_T) {
@@ -624,7 +627,8 @@ static double cu_rd_cost_tr_split_accurate(
   unsigned chroma_ssd = 0;
   if(has_chroma) {
     const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3  };
-    const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
+    const int chroma_width  = MAX(4, LCU_WIDTH >> (depth + 1));
+    const int chroma_height = chroma_width; // TODO: height for non-square blocks
     int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
     const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
 
@@ -646,8 +650,8 @@ static double cu_rd_cost_tr_split_accurate(
       if(chroma_can_use_tr_skip && cb_flag_v) {
         CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag");        
       }
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2);
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, tr_cu->tr_skip & 2);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, chroma_height, COLOR_V, scan_order, tr_cu->tr_skip & 4);
       
     }
     else {
@@ -664,7 +668,7 @@ static double cu_rd_cost_tr_split_accurate(
       if (chroma_can_use_tr_skip) {
         CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag");
       }
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, COLOR_U, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, 0);
     }
   }
 
diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h
index ae1845c8..9fc75c8a 100644
--- a/src/strategies/avx2/encode_coding_tree-avx2.h
+++ b/src/strategies/avx2/encode_coding_tree-avx2.h
@@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
                                cabac_data_t * const cabac,
                                const coeff_t *coeff,
                                uint8_t width,
+                               uint8_t height,
                                uint8_t type,
                                int8_t scan_mode,
                                int8_t tr_skip,
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 078df533..962a671a 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -875,8 +875,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
   return parts[0] + parts[1] + parts[2] + parts[3];
 }
 
-static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
   const __m256i zero           = _mm256_setzero_si256();
   const __m256i threes         = _mm256_set1_epi16(3);
   const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
@@ -893,7 +894,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64
   __m256i wts_lo     = _mm256_broadcastsi128_si256(wts_lo_128);
   __m256i wts_hi     = _mm256_broadcastsi128_si256(wts_hi_128);
 
-  for (int i = 0; i < width * width; i += 32) {
+  for (int i = 0; i < width * height; i += 32) {
     __m256i curr_lo      = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
     __m256i curr_abs_lo  = _mm256_abs_epi16   (curr_lo);
     __m256i curr_max3_lo = _mm256_min_epu16   (curr_abs_lo, threes);
diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c
index 189334b5..21785501 100644
--- a/src/strategies/generic/encode_coding_tree-generic.c
+++ b/src/strategies/generic/encode_coding_tree-generic.c
@@ -55,6 +55,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
   cabac_data_t * const cabac,
   const coeff_t *coeff,
   uint8_t width,
+  uint8_t height,
   uint8_t color,
   int8_t scan_mode,
   cu_info_t* cur_cu,
@@ -75,7 +76,6 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
 
   // CONSTANTS
 
-  const int height = width; // TODO: height for non-square blocks.
   const uint32_t log2_block_width =  uvg_g_convert_to_log2[width];
   const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
   const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1];
@@ -192,7 +192,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
 
         sig = (coeff[blk_pos] != 0) ? 1 : 0;
         if (num_non_zero || next_sig_pos != infer_sig_pos) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
           cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]);
           cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]);
 
@@ -200,7 +200,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
           reg_bins--;
 
         } else if (next_sig_pos != scan_pos_last) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
         }
 
 
@@ -266,7 +266,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
         blk_pos = scan[scan_pos];
         pos_y = blk_pos / width;
         pos_x = blk_pos - (pos_y * width);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4);
 
         rice_param = g_go_rice_pars[abs_sum];
         uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]);
@@ -284,7 +284,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
         pos_y = blk_pos / width;
         pos_x = blk_pos - (pos_y * width);
         uint32_t coeff_abs = abs(coeff[blk_pos]);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0);
         rice_param = g_go_rice_pars[abs_sum];        
         pos0 = ((quant_state<2)?1:2) << rice_param;
         uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? coeff_abs - 1 : coeff_abs);
diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h
index 8cfe497d..bcf51f15 100644
--- a/src/strategies/generic/encode_coding_tree-generic.h
+++ b/src/strategies/generic/encode_coding_tree-generic.h
@@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
                                   cabac_data_t * const cabac,
                                   const coeff_t *coeff,
                                   uint8_t width,
+                                  uint8_t height,
                                   uint8_t color,
                                   int8_t scan_mode,
                                   cu_info_t* cur_cu,
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index deb5c962..16fbce38 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -653,14 +653,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
   weights[3] = (wts_packed >> 48) & 0xffff;
 }
 
-static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
   uint32_t sum = 0;
   uint16_t weights_unpacked[4];
 
   get_coeff_weights(weights, weights_unpacked);
 
-  for (int32_t i = 0; i < width * width; i++) {
+  for (int32_t i = 0; i < width * height; i++) {
      int16_t curr = coeff[i];
     uint32_t curr_abs = abs(curr);
     if (curr_abs > 3) {
diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h
index 8743a6ed..f503eb73 100644
--- a/src/strategies/strategies-encode.h
+++ b/src/strategies/strategies-encode.h
@@ -50,6 +50,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
                                          cabac_data_t * const cabac,
                                          const coeff_t *coeff,
                                          uint8_t width,
+                                         uint8_t height,
                                          uint8_t color,
                                          int8_t scan_mode,
                                          cu_info_t* cur_cu,
diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h
index 2920ed82..b0e75046 100644
--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@@ -86,7 +86,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
   int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
 
-typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
+typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights);
 
 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);
 
diff --git a/src/transform.c b/src/transform.c
index 01f6289f..4738f942 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -690,6 +690,7 @@ void uvg_chroma_transform_search(
         u_quant_coeff,
         pred_cu,
         width,
+        height,
         COLOR_U,
         scan_order,
         transforms[i] == CHROMA_TS);
@@ -706,6 +707,7 @@ void uvg_chroma_transform_search(
         v_quant_coeff,
         pred_cu,
         width,
+        height,
         COLOR_V,
         scan_order,
         transforms[i] == CHROMA_TS);