From 8c534170064cabd28dbe6d16ff37bba5793d94a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 17 Jan 2018 13:39:20 +0200
Subject: [PATCH] Check zero coefficient cost for inter

Checks the cost of flushing all coefficients of an inter block to zero.
This is much faster than doing full RDOQ but can still reduce bitrate
significantly. Encoding speed is increased since fewer coefficient bits
have to be coded with CABAC.
---
 src/search.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/search.c b/src/search.c
index 31aafc87..c02aeebe 100644
--- a/src/search.c
+++ b/src/search.c
@@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   const videoframe_t * const frame = state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
   double cost = MAX_INT;
+  double inter_zero_coeff_cost = MAX_INT;
   uint32_t inter_bitcost = MAX_INT;
   cu_info_t *cur_cu;
 
@@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
-        if (state->encoder_control->cfg.rdo == 3) {
+        if (ctrl->cfg.rdo == 3) {
           cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
@@ -540,6 +541,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
       kvz_inter_recon_cu(state, lcu, x, y, cu_width);
 
+      if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
+        const int luma_index   = y_local * LCU_WIDTH + x_local;
+        const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
+
+        double ssd = 0.0;
+        ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
+          LCU_WIDTH, LCU_WIDTH, cu_width
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+
+        inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda;
+
+        // Save the pixels at a lower level of the working tree.
+        copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]);
+      }
+
       const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
       kvz_quantize_lcu_residual(state,
                                 true, has_chroma,
@@ -549,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
       int cbf = cbf_is_set_any(cur_cu->cbf, depth);
 
-      if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
+      if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
         cur_cu->merged = 0;
         cur_cu->skipped = 1;
         // Selecting skip reduces bits needed to code the CU
@@ -575,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     }
 
     cost += mode_bits * state->lambda;
+
+    if (inter_zero_coeff_cost <= cost) {
+      cost = inter_zero_coeff_cost;
+
+      // Restore saved pixels from lower level of the working tree.
+      copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
+
+      if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+        cur_cu->merged = 0;
+        cur_cu->skipped = 1;
+        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+      }
+
+      if (cur_cu->tr_depth != depth) {
+        // Reset transform depth since there are no coefficients. This
+        // ensures that CBF is cleared for the whole area of the CU.
+        kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
+      }
+
+      cur_cu->cbf = 0;
+      lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
+    }
   }
 
   bool can_split_cu =