[isp] Add non-square block handling to functions.

2024-11-23 18:14:06 +00:00 · 2022-08-18 15:07:22 +03:00 · 2022-08-18 15:07:22 +03:00 · ae0336fdfc
parent 031a758d6c
commit ae0336fdfc
15 changed files with 65 additions and 42 deletions
--- a/src/context.c
+++ b/src/context.c
@ -657,7 +657,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag,
 * \returns context index for current scan position
 */
 uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                                         uint32_t width, uint32_t height, int8_t type,
+                                         uint32_t width, uint32_t height, int8_t color,
                                         int32_t* temp_diag, int32_t* temp_sum)
 {
  const coeff_t* data = coeff + pos_x + pos_y * width;
@ -687,7 +687,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u
  }
 #undef UPDATE
  int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0);
-  if (type == 0 /* Luma */)
+  if (color == COLOR_Y)
  {
    ctx_ofs += diag < 5 ? 4 : 0;
  }
@ -815,7 +815,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos
 * \returns context go rice parameter
 */
 uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel)
+                             uint32_t width, uint32_t height, uint32_t baselevel)
 {
 #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/

@ -857,8 +857,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
 * \returns context go rice parameter
 */
 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-  uint32_t height, uint32_t width, uint32_t baselevel)
+  uint32_t width, uint32_t height, uint32_t baselevel)
 {
-  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel);
+  uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel);
  return  g_go_rice_pars[check];  
 }
--- a/src/context.h
+++ b/src/context.h
@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
                     uint32_t height, uint32_t width, uint32_t baselevel);

 uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
-                             uint32_t height, uint32_t width, uint32_t baselevel);
+                             uint32_t width, uint32_t height, uint32_t baselevel);

 #define CNU 35
 #define DWS 8
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -213,6 +213,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
  cabac_data_t* const cabac,
  const coeff_t* coeff,
  uint32_t width,
+  uint32_t height,
  uint8_t type,
  int8_t scan_mode,
  double* bits_out) 
@ -228,7 +229,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
  // CONSTANTS

  const uint32_t log2_block_width  = uvg_g_convert_to_log2[width];
-  const uint32_t log2_block_height = log2_block_width; // TODO: height
+  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  // TODO: log2_cg_size is wrong if width != height: the lookup below uses log2_block_width for both table indices and ignores log2_block_height.
  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1];
  const uint32_t* old_scan =    uvg_g_sig_last_scan[scan_mode][log2_block_width - 1];
  const uint32_t* old_scan_cg = g_sig_last_scan_cg[log2_block_width - 1][scan_mode];
@ -243,13 +245,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state,

  cabac->cur_ctx = base_coeff_group_ctx;
  
-  // ISP_TODO: height
-  int maxCtxBins = (width * width * 7) >> 2;
+  int maxCtxBins = (width * height * 7) >> 2;
  unsigned scan_cg_last = (unsigned )-1;
  //unsigned scan_pos_last = (unsigned )-1;

-  // ISP_TODO: height
-  for (i = 0; i < width * width; i++) {
+  for (i = 0; i < width * height; i++) {
    if (coeff[scan[i]]) {
      // ISP_DEBUG
      assert(old_scan[i] == scan[i] && "Old scan_cg differs from the new one.");
@ -258,7 +258,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
      sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1;
    }
  }
-  scan_cg_last = (width * width - 1) >> log2_cg_size;
+  // TODO: scan_cg_last won't be correct for non-square blocks, because log2_cg_size is still derived from width only.
+  scan_cg_last = (width * height - 1) >> log2_cg_size;
  const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2));

  bool no_sig_group_before_last = true;
@ -481,6 +482,7 @@ static void encode_chroma_tu(
  enum
  uvg_tree_type tree_type)
 {
+  int height_c = width_c; // TODO: height for non-square blocks
  int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C;
  int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C;
  cabac_data_t* const cabac = &state->cabac;
@ -496,7 +498,7 @@ static void encode_chroma_tu(
        // TODO: transform skip for chroma blocks
        CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag");
      }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, cur_pu, NULL);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, height_c, COLOR_U, *scan_idx, cur_pu, NULL);
    }

    if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
@ -504,7 +506,7 @@ static void encode_chroma_tu(
        cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
        CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag");
      }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, cur_pu, NULL);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL);
    }
  }
  else {
@ -513,7 +515,7 @@ static void encode_chroma_tu(
      cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
      CABAC_BIN(cabac, 0, "transform_skip_flag");
    }
-    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, cur_pu, NULL);
+    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, height_c, COLOR_V, *scan_idx, cur_pu, NULL);
    
  }
 }
@ -534,6 +536,9 @@ static void encode_transform_unit(
  cabac_data_t* const cabac = &state->cabac;
  const uint8_t width = LCU_WIDTH >> depth;
  const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
+  // TODO: height for non-square blocks
+  const uint8_t height = width;
+  const uint8_t height_c = width_c;

  cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array;
  const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y);
@ -556,13 +561,14 @@ static void encode_transform_unit(
      DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0);
    }
    if(cur_pu->tr_idx == MTS_SKIP) {
-      uvg_encode_ts_residual(state, cabac, coeff_y, width, 0, scan_idx, NULL);      
+      uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL);      
    }
    else {
      uvg_encode_coeff_nxn(state,
                           cabac,
                           coeff_y,
                           width,
+                           height,
                           0,
                           scan_idx,
                           (cu_info_t * )cur_pu,
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -64,6 +64,7 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
  cabac_data_t* const cabac,
  const coeff_t* coeff,
  uint32_t width,
+  uint32_t height,
  uint8_t type,
  int8_t scan_mode,
  double* bits);
--- a/src/rdo.c
+++ b/src/rdo.c
@ -298,6 +298,7 @@ static INLINE double get_coeff_cabac_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
  int32_t width,
+  int32_t height,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip,
@ -305,7 +306,7 @@ static INLINE double get_coeff_cabac_cost(
 {
  // Make sure there are coeffs present
  bool found = false;
-  for (int i = 0; i < width*width; i++) {
+  for (int i = 0; i < width * height; i++) {
    if (coeff[i] != 0) {
      found = 1;
      break;
@ -331,6 +332,7 @@ static INLINE double get_coeff_cabac_cost(
                         &cabac_copy,
                         coeff,
                         width,
+                         height,
                         color,
                         scan_mode,
                         cur_tu,                   
@ -341,6 +343,7 @@ static INLINE double get_coeff_cabac_cost(
      &cabac_copy,
      coeff,
      width,
+      height,
      color,
      scan_mode,
      &bits);
@ -392,6 +395,7 @@ double uvg_get_coeff_cost(
  const coeff_t *coeff,
  cu_info_t* cur_tu,
  int32_t width,
+  int32_t height,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip)
@ -409,15 +413,15 @@ double uvg_get_coeff_cost(
      return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
    } else {
      uint64_t weights = uvg_fast_coeff_get_weights(state);
-      uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
+      uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, height, weights);
      if (check_accuracy) {
-        double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+        double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu);
        save_accuracy(state->qp, ccc, fast_cost);
      }
      return fast_cost;
    }
  } else {
-    double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
+    double ccc = get_coeff_cabac_cost(state, coeff, width, height, color, scan_mode, tr_skip, cur_tu);
    if (save_cccs) {
      save_ccc(state->qp, coeff, width * width, ccc);
    }
--- a/src/rdo.h
+++ b/src/rdo.h
@ -74,6 +74,7 @@ double uvg_get_coeff_cost(
  const coeff_t *coeff,
  cu_info_t* cur_tu,
  int32_t width,
+  int32_t height,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip);
--- a/src/search.c
+++ b/src/search.c
@ -311,6 +311,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
                           uint8_t isp_cbf)
 {
  const int width  = LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;

@ -380,7 +381,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
    const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];

-    coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP);
+    coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, height, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP);
  }

  double bits = tr_tree_bits + coeff_bits;
@ -395,6 +396,7 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
 {
  const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
  const int width  = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks
  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);

@ -468,11 +470,11 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
    const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);

    if((pred_cu->joint_cb_cr & 3) == 0){
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, 2, scan_order, 0);
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, height, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, height, 2, scan_order, 0);
    }
    else {
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, height, 2, scan_order, 0);
      
    }
  }
@ -493,6 +495,7 @@ static double cu_rd_cost_tr_split_accurate(
  enum uvg_tree_type tree_type,
  uint8_t isp_cbf) {
  const int width = LCU_WIDTH >> depth;
+  const int height = width; // TODO: height for non-square blocks

  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
  // cur_cu is used for TU parameters.
@ -597,7 +600,7 @@ static double cu_rd_cost_tr_split_accurate(
    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
    const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];

-    coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1);
+    coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, height, 0, luma_scan_mode, tr_cu->tr_skip & 1);
  }

  if(depth == 4 || tree_type == UVG_LUMA_T) {
@ -625,6 +628,7 @@ static double cu_rd_cost_tr_split_accurate(
  if(has_chroma) {
    const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3  };
    const int chroma_width  = MAX(4, LCU_WIDTH >> (depth + 1));
+    const int chroma_height = chroma_width; // TODO: height for non-square blocks
    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
    const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);

@ -646,8 +650,8 @@ static double cu_rd_cost_tr_split_accurate(
      if(chroma_can_use_tr_skip && cb_flag_v) {
        CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag");        
      }
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2);
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, tr_cu->tr_skip & 2);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, chroma_height, COLOR_V, scan_order, tr_cu->tr_skip & 4);
      
    }
    else {
@ -664,7 +668,7 @@ static double cu_rd_cost_tr_split_accurate(
      if (chroma_can_use_tr_skip) {
        CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag");
      }
-      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, COLOR_U, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, chroma_height, COLOR_U, scan_order, 0);
    }
  }

--- a/src/strategies/avx2/encode_coding_tree-avx2.h
+++ b/src/strategies/avx2/encode_coding_tree-avx2.h
@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
                               cabac_data_t * const cabac,
                               const coeff_t *coeff,
                               uint8_t width,
+                               uint8_t height,
                               uint8_t type,
                               int8_t scan_mode,
                               int8_t tr_skip,
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -875,8 +875,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
  return parts[0] + parts[1] + parts[2] + parts[3];
 }

-static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
  const __m256i zero           = _mm256_setzero_si256();
  const __m256i threes         = _mm256_set1_epi16(3);
  const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
@ -893,7 +894,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64
  __m256i wts_lo     = _mm256_broadcastsi128_si256(wts_lo_128);
  __m256i wts_hi     = _mm256_broadcastsi128_si256(wts_hi_128);

-  for (int i = 0; i < width * width; i += 32) {
+  for (int i = 0; i < width * height; i += 32) {
    __m256i curr_lo      = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
    __m256i curr_abs_lo  = _mm256_abs_epi16   (curr_lo);
    __m256i curr_max3_lo = _mm256_min_epu16   (curr_abs_lo, threes);
--- a/src/strategies/generic/encode_coding_tree-generic.c
+++ b/src/strategies/generic/encode_coding_tree-generic.c
@ -55,6 +55,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
  cabac_data_t * const cabac,
  const coeff_t *coeff,
  uint8_t width,
+  uint8_t height,
  uint8_t color,
  int8_t scan_mode,
  cu_info_t* cur_cu,
@ -75,7 +76,6 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

  // CONSTANTS

-  const int height = width; // TODO: height for non-square blocks.
  const uint32_t log2_block_width =  uvg_g_convert_to_log2[width];
  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_width][1];
@ -192,7 +192,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

        sig = (coeff[blk_pos] != 0) ? 1 : 0;
        if (num_non_zero || next_sig_pos != infer_sig_pos) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
          cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]);
          cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]);

@ -200,7 +200,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
          reg_bins--;

        } else if (next_sig_pos != scan_pos_last) {
-          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
+          ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
        }


@ -266,7 +266,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
        blk_pos = scan[scan_pos];
        pos_y = blk_pos / width;
        pos_x = blk_pos - (pos_y * width);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4);

        rice_param = g_go_rice_pars[abs_sum];
        uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]);
@ -284,7 +284,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
        pos_y = blk_pos / width;
        pos_x = blk_pos - (pos_y * width);
        uint32_t coeff_abs = abs(coeff[blk_pos]);
-        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0);
+        int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0);
        rice_param = g_go_rice_pars[abs_sum];        
        pos0 = ((quant_state<2)?1:2) << rice_param;
        uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? coeff_abs - 1 : coeff_abs);
--- a/src/strategies/generic/encode_coding_tree-generic.h
+++ b/src/strategies/generic/encode_coding_tree-generic.h
@ -45,6 +45,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
                                  cabac_data_t * const cabac,
                                  const coeff_t *coeff,
                                  uint8_t width,
+                                  uint8_t height,
                                  uint8_t color,
                                  int8_t scan_mode,
                                  cu_info_t* cur_cu,
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -653,14 +653,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
  weights[3] = (wts_packed >> 48) & 0xffff;
 }

-static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
+static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
 {
+  assert((width == height) && "Non-square block handling not implemented for this function.");
  uint32_t sum = 0;
  uint16_t weights_unpacked[4];

  get_coeff_weights(weights, weights_unpacked);

-  for (int32_t i = 0; i < width * width; i++) {
+  for (int32_t i = 0; i < width * height; i++) {
     int16_t curr = coeff[i];
    uint32_t curr_abs = abs(curr);
    if (curr_abs > 3) {
--- a/src/strategies/strategies-encode.h
+++ b/src/strategies/strategies-encode.h
@ -50,6 +50,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
                                         cabac_data_t * const cabac,
                                         const coeff_t *coeff,
                                         uint8_t width,
+                                         uint8_t height,
                                         uint8_t color,
                                         int8_t scan_mode,
                                         cu_info_t* cur_cu,
--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@ -86,7 +86,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
  int32_t height, color_t color, int8_t block_type, int8_t transform_skip);

-typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
+typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights);

 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

--- a/src/transform.c
+++ b/src/transform.c
@ -690,6 +690,7 @@ void uvg_chroma_transform_search(
        u_quant_coeff,
        pred_cu,
        width,
+        height,
        COLOR_U,
        scan_order,
        transforms[i] == CHROMA_TS);
@ -706,6 +707,7 @@ void uvg_chroma_transform_search(
        v_quant_coeff,
        pred_cu,
        width,
+        height,
        COLOR_V,
        scan_order,
        transforms[i] == CHROMA_TS);