From 018b5ffa64e81d1e4bda0acaaaccc11d2083824c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 17 Jan 2018 13:09:47 +0200
Subject: [PATCH 1/2] Move inter CU reconstruction to a new function

Moves code for reconstructing all PUs in an inter CU to a new function
kvz_inter_recon_cu in inter.c.
---
 src/inter.c        | 125 ++++++++++++++++++++++++++++++++++-----------
 src/inter.h        |  33 +++++-------
 src/search.c       |  42 +--------------
 src/search_inter.c |  24 ++++-----
 4 files changed, 121 insertions(+), 103 deletions(-)

diff --git a/src/inter.c b/src/inter.c
index 51cf1f05..ce9dca48 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -306,27 +306,27 @@ static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride,
 
 
 /**
- * \brief Reconstruct inter block
+ * \brief Reconstruct an inter PU using uniprediction.
  *
  * \param state         encoder state
  * \param ref           picture to copy the data from
- * \param xpos          block x position
- * \param ypos          block y position
- * \param width         block width
- * \param height        block height
+ * \param xpos          PU x position
+ * \param ypos          PU y position
+ * \param width         PU width
+ * \param height        PU height
  * \param mv_param      motion vector
  * \param lcu           destination lcu
- * \param hi_prec_out   destination of high precision output (null if not needed)
+ * \param hi_prec_out   destination of high precision output, or NULL if not needed
 */
-void kvz_inter_recon_lcu(const encoder_state_t * const state,
-                         const kvz_picture * const ref,
-                         int32_t xpos,
-                         int32_t ypos,
-                         int32_t width,
-                         int32_t height,
-                         const int16_t mv_param[2],
-                         lcu_t *lcu,
-                         hi_prec_buf_t *hi_prec_out)
+static void inter_recon_unipred(const encoder_state_t * const state,
+                                const kvz_picture * const ref,
+                                int32_t xpos,
+                                int32_t ypos,
+                                int32_t width,
+                                int32_t height,
+                                const int16_t mv_param[2],
+                                lcu_t *lcu,
+                                hi_prec_buf_t *hi_prec_out)
 {
   const vector2d_t pu_in_tile = { xpos, ypos };
   const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH };
@@ -428,27 +428,27 @@ void kvz_inter_recon_lcu(const encoder_state_t * const state,
 }
 
 /**
- * \brief Reconstruct bi-pred inter block
+ * \brief Reconstruct bi-pred inter PU
  *
  * \param state     encoder state
  * \param ref1      reference picture to copy the data from
  * \param ref2      other reference picture to copy the data from
- * \param xpos      block x position
- * \param ypos      block y position
- * \param width     block width
- * \param height    block height
+ * \param xpos      PU x position
+ * \param ypos      PU y position
+ * \param width     PU width
+ * \param height    PU height
  * \param mv_param  motion vectors
  * \param lcu       destination lcu
  */
-void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
-                                const kvz_picture * ref1,
-                                const kvz_picture * ref2,
-                                int32_t xpos,
-                                int32_t ypos,
-                                int32_t width,
-                                int32_t height,
-                                int16_t mv_param[2][2],
-                                lcu_t* lcu)
+void kvz_inter_recon_bipred(const encoder_state_t * const state,
+                            const kvz_picture * ref1,
+                            const kvz_picture * ref2,
+                            int32_t xpos,
+                            int32_t ypos,
+                            int32_t width,
+                            int32_t height,
+                            int16_t mv_param[2][2],
+                            lcu_t* lcu)
 {
   kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH];
   kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C];
@@ -468,7 +468,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
   if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
   if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
   //Reconstruct both predictors
-  kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0);
+  inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0);
   if (!hi_prec_luma_rec0){
     memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64);
   }
@@ -476,7 +476,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
     memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32);
     memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32);
   }
-  kvz_inter_recon_lcu(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1);
+  inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1);
 
   // After reconstruction, merge the predictors by taking an average of each pixel
   for (temp_y = 0; temp_y < height; ++temp_y) {
@@ -506,6 +506,69 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
   if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1);
 }
 
+
+/**
+ * \brief Reconstruct a single inter CU.
+ *
+ * The CU may consist of multiple PUs, each of which can use either
+ * uniprediction or biprediction.
+ *
+ * \param state   encoder state
+ * \param lcu     containing LCU, also the destination of the reconstruction
+ * \param x       x-coordinate of the CU in pixels
+ * \param y       y-coordinate of the CU in pixels
+ * \param width   CU width in pixels
+ */
+void kvz_inter_recon_cu(const encoder_state_t * const state,
+                        lcu_t *lcu,
+                        int32_t x,
+                        int32_t y,
+                        int32_t width)
+{
+  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+
+  const int num_pu = kvz_part_mode_num_parts[cu->part_size]; // PU count of the partition mode
+  for (int i = 0; i < num_pu; ++i) {
+    const int pu_x = PU_GET_X(cu->part_size, width, x, i);
+    const int pu_y = PU_GET_Y(cu->part_size, width, y, i);
+    const int pu_w = PU_GET_W(cu->part_size, width, i);
+    const int pu_h = PU_GET_H(cu->part_size, width, i);
+
+    cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
+
+    if (pu->inter.mv_dir == 3) { // both reference lists are used: biprediction
+      const kvz_picture *const refs[2] = {
+        state->frame->ref->images[
+          state->frame->ref_LX[0][
+            pu->inter.mv_ref[0]]],
+        state->frame->ref->images[
+          state->frame->ref_LX[1][
+            pu->inter.mv_ref[1]]],
+      };
+      kvz_inter_recon_bipred(state,
+                             refs[0], refs[1],
+                             pu_x, pu_y,
+                             pu_w, pu_h,
+                             pu->inter.mv,
+                             lcu);
+    } else {
+      const int mv_idx = pu->inter.mv_dir - 1; // assumes mv_dir is 1 or 2 here, giving list index 0 or 1
+      const kvz_picture *const ref =
+        state->frame->ref->images[
+          state->frame->ref_LX[mv_idx][
+            pu->inter.mv_ref[mv_idx]]];
+
+      inter_recon_unipred(state,
+                          ref,
+                          pu_x, pu_y,
+                          pu_w, pu_h,
+                          pu->inter.mv[mv_idx],
+                          lcu,
+                          NULL); // no high precision output needed
+    }
+  }
+}
+
 /**
  * \brief Clear unused L0/L1 motion vectors and reference
  * \param cu coding unit to clear
diff --git a/src/inter.h b/src/inter.h
index ee324fc1..3380cb1d 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -40,26 +40,21 @@ typedef struct {
 
 } inter_merge_cand_t;
 
+void kvz_inter_recon_cu(const encoder_state_t * const state,
+                        lcu_t *lcu,
+                        int32_t x,
+                        int32_t y,
+                        int32_t width);
 
-void kvz_inter_recon_lcu(const encoder_state_t * const state,
-                         const kvz_picture * ref,
-                         int32_t xpos,
-                         int32_t ypos,
-                         int32_t width,
-                         int32_t height,
-                         const int16_t mv_param[2],
-                         lcu_t* lcu,
-                         hi_prec_buf_t *hi_prec_out);
-
-void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
-                                const kvz_picture * ref1,
-                                const kvz_picture * ref2,
-                                int32_t xpos,
-                                int32_t ypos,
-                                int32_t width,
-                                int32_t height,
-                                int16_t mv_param[2][2],
-                                lcu_t* lcu);
+void kvz_inter_recon_bipred(const encoder_state_t * const state,
+                            const kvz_picture * ref1,
+                            const kvz_picture * ref2,
+                            int32_t xpos,
+                            int32_t ypos,
+                            int32_t width,
+                            int32_t height,
+                            int16_t mv_param[2][2],
+                            lcu_t* lcu);
 
 void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t x,
diff --git a/src/search.c b/src/search.c
index 9943570c..31aafc87 100644
--- a/src/search.c
+++ b/src/search.c
@@ -538,47 +538,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       }
       kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth);
 
-      const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size];
-      for (int i = 0; i < num_pu; ++i) {
-        const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
-        const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
-        const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i);
-        const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
-
-        cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
-
-        if (cur_pu->inter.mv_dir == 3) {
-          const kvz_picture *const refs[2] = {
-            state->frame->ref->images[
-              state->frame->ref_LX[0][
-                cur_pu->inter.mv_ref[0]]],
-            state->frame->ref->images[
-              state->frame->ref_LX[1][
-                cur_pu->inter.mv_ref[1]]],
-          };
-          kvz_inter_recon_lcu_bipred(state,
-                                     refs[0], refs[1],
-                                     pu_x, pu_y,
-                                     pu_w, pu_h,
-                                     cur_pu->inter.mv,
-                                     lcu);
-        } else {
-          const int mv_idx = cur_pu->inter.mv_dir - 1;
-          
-          const kvz_picture *const ref =
-              state->frame->ref->images[
-                state->frame->ref_LX[mv_idx][
-                  cur_pu->inter.mv_ref[mv_idx]]];
-
-          kvz_inter_recon_lcu(state,
-                              ref,
-                              pu_x, pu_y,
-                              pu_w, pu_h,
-                              cur_pu->inter.mv[mv_idx],
-                              lcu,
-                              0);
-        }
-      }
+      kvz_inter_recon_cu(state, lcu, x, y, cu_width);
 
       const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
       kvz_quantize_lcu_residual(state,
diff --git a/src/search_inter.c b/src/search_inter.c
index 306f89e1..7e659c3b 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -1495,18 +1495,18 @@ static void search_pu_inter(encoder_state_t * const state,
             continue;
           }
 
-          kvz_inter_recon_lcu_bipred(state,
-                                     state->frame->ref->images[
-                                       state->frame->ref_LX[0][merge_cand[i].ref[0]]
-                                     ],
-                                     state->frame->ref->images[
-                                       state->frame->ref_LX[1][merge_cand[j].ref[1]]
-                                     ],
-                                     x, y,
-                                     width,
-                                     height,
-                                     mv,
-                                     templcu);
+          kvz_inter_recon_bipred(state,
+                                 state->frame->ref->images[
+                                   state->frame->ref_LX[0][merge_cand[i].ref[0]]
+                                 ],
+                                 state->frame->ref->images[
+                                   state->frame->ref_LX[1][merge_cand[j].ref[1]]
+                                 ],
+                                 x, y,
+                                 width,
+                                 height,
+                                 mv,
+                                 templcu);
 
           for (int ypos = 0; ypos < height; ++ypos) {
             int dst_y = ypos * width;

From 8c534170064cabd28dbe6d16ff37bba5793d94a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 17 Jan 2018 13:39:20 +0200
Subject: [PATCH 2/2] Check zero coefficient cost for inter

Checks the cost of flushing all coefficients of an inter block to zero.
This is much faster than doing full RDOQ but can still reduce bitrate
significantly. Encoding speed is increased since fewer coefficient bits
have to be coded with CABAC.
---
 src/search.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/search.c b/src/search.c
index 31aafc87..c02aeebe 100644
--- a/src/search.c
+++ b/src/search.c
@@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   const videoframe_t * const frame = state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
   double cost = MAX_INT;
+  double inter_zero_coeff_cost = MAX_INT;
   uint32_t inter_bitcost = MAX_INT;
   cu_info_t *cur_cu;
 
@@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
-        if (state->encoder_control->cfg.rdo == 3) {
+        if (ctrl->cfg.rdo == 3) {
           cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
@@ -540,6 +541,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
       kvz_inter_recon_cu(state, lcu, x, y, cu_width);
 
+      if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
+        const int luma_index   = y_local * LCU_WIDTH + x_local;
+        const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
+
+        double ssd = 0.0;
+        ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
+          LCU_WIDTH, LCU_WIDTH, cu_width
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+
+        inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda;
+
+        // Save the pixels at a lower level of the working tree.
+        copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]);
+      }
+
       const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
       kvz_quantize_lcu_residual(state,
                                 true, has_chroma,
@@ -549,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
       int cbf = cbf_is_set_any(cur_cu->cbf, depth);
 
-      if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
+      if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
         cur_cu->merged = 0;
         cur_cu->skipped = 1;
         // Selecting skip reduces bits needed to code the CU
@@ -575,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     }
 
     cost += mode_bits * state->lambda;
+
+    if (inter_zero_coeff_cost <= cost) {
+      cost = inter_zero_coeff_cost;
+
+      // Restore saved pixels from lower level of the working tree.
+      copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
+
+      if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+        cur_cu->merged = 0;
+        cur_cu->skipped = 1;
+        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+      }
+
+      if (cur_cu->tr_depth != depth) {
+        // Reset transform depth since there are no coefficients. This
+        // ensures that CBF is cleared for the whole area of the CU.
+        kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
+      }
+
+      cur_cu->cbf = 0;
+      lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
+    }
   }
 
   bool can_split_cu =