Check zero coefficient cost for inter

Checks the cost of flushing all coefficients of an inter block to zero. This is much faster than doing full RDOQ but can still reduce bitrate significantly. Encoding speed is increased since fewer coefficient bits have to be coded with CABAC.
2024-11-27 19:24:06 +00:00 · 2018-01-17 13:39:20 +02:00 · 2018-01-17 13:39:20 +02:00 · 8c53417006
parent 018b5ffa64
commit 8c53417006
1 changed files with 49 additions and 2 deletions
--- a/src/search.c
+++ b/src/search.c
@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
  const videoframe_t * const frame = state->tile->frame;
  int cu_width = LCU_WIDTH >> depth;
  double cost = MAX_INT;
+  double inter_zero_coeff_cost = MAX_INT;
  uint32_t inter_bitcost = MAX_INT;
  cu_info_t *cur_cu;

@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        // rd2. Possibly because the luma mode search already takes chroma
        // into account, so there is less of a chanse of luma mode being
        // really bad for chroma.
-        if (state->encoder_control->cfg.rdo == 3) {
+        if (ctrl->cfg.rdo == 3) {
          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
          lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
        }
@ -540,6 +541,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

      kvz_inter_recon_cu(state, lcu, x, y, cu_width);

+      if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
+        const int luma_index   = y_local * LCU_WIDTH + x_local;
+        const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
+
+        double ssd = 0.0;
+        ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
+          LCU_WIDTH, LCU_WIDTH, cu_width
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+
+        inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda;
+
+        // Save the pixels at a lower level of the working tree.
+        copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]);
+      }
+
      const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
      kvz_quantize_lcu_residual(state,
                                true, has_chroma,
@ -549,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

      int cbf = cbf_is_set_any(cur_cu->cbf, depth);

-      if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
+      if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
        cur_cu->merged = 0;
        cur_cu->skipped = 1;
        // Selecting skip reduces bits needed to code the CU
@ -575,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    }

    cost += mode_bits * state->lambda;
+
+    if (inter_zero_coeff_cost <= cost) {
+      cost = inter_zero_coeff_cost;
+
+      // Restore saved pixels from lower level of the working tree.
+      copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
+
+      if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+        cur_cu->merged = 0;
+        cur_cu->skipped = 1;
+        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+      }
+
+      if (cur_cu->tr_depth != depth) {
+        // Reset transform depth since there are no coefficients. This
+        // ensures that CBF is cleared for the whole area of the CU.
+        kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
+      }
+
+      cur_cu->cbf = 0;
+      lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
+    }
  }

  bool can_split_cu =