diff --git a/src/inter.c b/src/inter.c
index 51cf1f05..ce9dca48 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -306,27 +306,27 @@ static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride,
 
 
 /**
- * \brief Reconstruct inter block
+ * \brief Reconstruct an inter PU using uniprediction.
  *
  * \param state         encoder state
  * \param ref           picture to copy the data from
- * \param xpos          block x position
- * \param ypos          block y position
- * \param width         block width
- * \param height        block height
+ * \param xpos          PU x position
+ * \param ypos          PU y position
+ * \param width         PU width
+ * \param height        PU height
  * \param mv_param      motion vector
  * \param lcu           destination lcu
- * \param hi_prec_out   destination of high precision output (null if not needed)
+ * \param hi_prec_out   destination of high precision output, or NULL if not needed
 */
-void kvz_inter_recon_lcu(const encoder_state_t * const state,
-                         const kvz_picture * const ref,
-                         int32_t xpos,
-                         int32_t ypos,
-                         int32_t width,
-                         int32_t height,
-                         const int16_t mv_param[2],
-                         lcu_t *lcu,
-                         hi_prec_buf_t *hi_prec_out)
+static void inter_recon_unipred(const encoder_state_t * const state,
+                                const kvz_picture * const ref,
+                                int32_t xpos,
+                                int32_t ypos,
+                                int32_t width,
+                                int32_t height,
+                                const int16_t mv_param[2],
+                                lcu_t *lcu,
+                                hi_prec_buf_t *hi_prec_out)
 {
   const vector2d_t pu_in_tile = { xpos, ypos };
   const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH };
@@ -428,27 +428,27 @@ void kvz_inter_recon_lcu(const encoder_state_t * const state,
 }
 
 /**
- * \brief Reconstruct bi-pred inter block
+ * \brief Reconstruct an inter PU using biprediction.
  *
  * \param state     encoder state
  * \param ref1      reference picture to copy the data from
  * \param ref2      other reference picture to copy the data from
- * \param xpos      block x position
- * \param ypos      block y position
- * \param width     block width
- * \param height    block height
+ * \param xpos      PU x position
+ * \param ypos      PU y position
+ * \param width     PU width
+ * \param height    PU height
  * \param mv_param  motion vectors
  * \param lcu       destination lcu
  */
-void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
-                                const kvz_picture * ref1,
-                                const kvz_picture * ref2,
-                                int32_t xpos,
-                                int32_t ypos,
-                                int32_t width,
-                                int32_t height,
-                                int16_t mv_param[2][2],
-                                lcu_t* lcu)
+void kvz_inter_recon_bipred(const encoder_state_t * const state,
+                            const kvz_picture * ref1,
+                            const kvz_picture * ref2,
+                            int32_t xpos,
+                            int32_t ypos,
+                            int32_t width,
+                            int32_t height,
+                            int16_t mv_param[2][2],
+                            lcu_t* lcu)
 {
   kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH];
   kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C];
@@ -468,7 +468,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
   if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
   if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
   //Reconstruct both predictors
-  kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0);
+  inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0);
   if (!hi_prec_luma_rec0){
     memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64);
   }
@@ -476,7 +476,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
     memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32);
     memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32);
   }
-  kvz_inter_recon_lcu(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1);
+  inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1);
 
   // After reconstruction, merge the predictors by taking an average of each pixel
   for (temp_y = 0; temp_y < height; ++temp_y) {
@@ -506,6 +506,69 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
   if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1);
 }
 
+
+/**
+ * \brief Reconstruct a single inter CU, one PU at a time.
+ *
+ * The CU may consist of multiple PUs. A PU with mv_dir == 3 is predicted from
+ * both reference lists (biprediction); otherwise list mv_dir - 1 is used.
+ *
+ * \param state   encoder state; references come from state->frame->ref
+ * \param lcu     containing LCU: source of CU/PU info, destination of output
+ * \param x       x-coordinate of the CU in pixels
+ * \param y       y-coordinate of the CU in pixels
+ * \param width   CU width in pixels; PU positions and sizes derive from it
+ */
+void kvz_inter_recon_cu(const encoder_state_t * const state,
+                        lcu_t *lcu,
+                        int32_t x,
+                        int32_t y,
+                        int32_t width)
+{
+  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+
+  const int num_pu = kvz_part_mode_num_parts[cu->part_size]; // PUs in this partition mode
+  for (int i = 0; i < num_pu; ++i) {
+    const int pu_x = PU_GET_X(cu->part_size, width, x, i);
+    const int pu_y = PU_GET_Y(cu->part_size, width, y, i);
+    const int pu_w = PU_GET_W(cu->part_size, width, i);
+    const int pu_h = PU_GET_H(cu->part_size, width, i);
+
+    cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); // CU info entry at the PU's position
+
+    if (pu->inter.mv_dir == 3) { // both reference lists are used: biprediction
+      const kvz_picture *const refs[2] = {
+        state->frame->ref->images[
+          state->frame->ref_LX[0][
+            pu->inter.mv_ref[0]]],
+        state->frame->ref->images[
+          state->frame->ref_LX[1][
+            pu->inter.mv_ref[1]]],
+      };
+      kvz_inter_recon_bipred(state,
+                             refs[0], refs[1],
+                             pu_x, pu_y,
+                             pu_w, pu_h,
+                             pu->inter.mv,
+                             lcu);
+    } else {
+      const int mv_idx = pu->inter.mv_dir - 1; // NOTE(review): assumes mv_dir is 1 or 2 here, so mv_idx is 0 or 1 - confirm
+      const kvz_picture *const ref =
+        state->frame->ref->images[
+          state->frame->ref_LX[mv_idx][
+            pu->inter.mv_ref[mv_idx]]];
+
+      inter_recon_unipred(state,
+                          ref,
+                          pu_x, pu_y,
+                          pu_w, pu_h,
+                          pu->inter.mv[mv_idx],
+                          lcu,
+                          NULL); // no high precision output needed
+    }
+  }
+}
+
 /**
  * \brief Clear unused L0/L1 motion vectors and reference
  * \param cu coding unit to clear
diff --git a/src/inter.h b/src/inter.h
index ee324fc1..3380cb1d 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -40,26 +40,21 @@ typedef struct {
 
 } inter_merge_cand_t;
 
+void kvz_inter_recon_cu(const encoder_state_t * const state,
+                        lcu_t *lcu,
+                        int32_t x,
+                        int32_t y,
+                        int32_t width);
 
-void kvz_inter_recon_lcu(const encoder_state_t * const state,
-                         const kvz_picture * ref,
-                         int32_t xpos,
-                         int32_t ypos,
-                         int32_t width,
-                         int32_t height,
-                         const int16_t mv_param[2],
-                         lcu_t* lcu,
-                         hi_prec_buf_t *hi_prec_out);
-
-void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state,
-                                const kvz_picture * ref1,
-                                const kvz_picture * ref2,
-                                int32_t xpos,
-                                int32_t ypos,
-                                int32_t width,
-                                int32_t height,
-                                int16_t mv_param[2][2],
-                                lcu_t* lcu);
+void kvz_inter_recon_bipred(const encoder_state_t * const state,
+                            const kvz_picture * ref1,
+                            const kvz_picture * ref2,
+                            int32_t xpos,
+                            int32_t ypos,
+                            int32_t width,
+                            int32_t height,
+                            int16_t mv_param[2][2],
+                            lcu_t* lcu);
 
 void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t x,
diff --git a/src/search.c b/src/search.c
index 9943570c..c02aeebe 100644
--- a/src/search.c
+++ b/src/search.c
@@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   const videoframe_t * const frame = state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
   double cost = MAX_INT;
+  double inter_zero_coeff_cost = MAX_INT;
   uint32_t inter_bitcost = MAX_INT;
   cu_info_t *cur_cu;
 
@@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
-        if (state->encoder_control->cfg.rdo == 3) {
+        if (ctrl->cfg.rdo == 3) {
           cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
@@ -538,46 +539,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       }
       kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth);
 
-      const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size];
-      for (int i = 0; i < num_pu; ++i) {
-        const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
-        const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
-        const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i);
-        const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
+      kvz_inter_recon_cu(state, lcu, x, y, cu_width);
 
-        cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
+      if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
+        const int luma_index   = y_local * LCU_WIDTH + x_local;
+        const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
 
-        if (cur_pu->inter.mv_dir == 3) {
-          const kvz_picture *const refs[2] = {
-            state->frame->ref->images[
-              state->frame->ref_LX[0][
-                cur_pu->inter.mv_ref[0]]],
-            state->frame->ref->images[
-              state->frame->ref_LX[1][
-                cur_pu->inter.mv_ref[1]]],
-          };
-          kvz_inter_recon_lcu_bipred(state,
-                                     refs[0], refs[1],
-                                     pu_x, pu_y,
-                                     pu_w, pu_h,
-                                     cur_pu->inter.mv,
-                                     lcu);
-        } else {
-          const int mv_idx = cur_pu->inter.mv_dir - 1;
-          
-          const kvz_picture *const ref =
-              state->frame->ref->images[
-                state->frame->ref_LX[mv_idx][
-                  cur_pu->inter.mv_ref[mv_idx]]];
+        double ssd = 0.0;
+        ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
+          LCU_WIDTH, LCU_WIDTH, cu_width
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
+        ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+          &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
+          LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+        );
 
-          kvz_inter_recon_lcu(state,
-                              ref,
-                              pu_x, pu_y,
-                              pu_w, pu_h,
-                              cur_pu->inter.mv[mv_idx],
-                              lcu,
-                              0);
-        }
+        inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda;
+
+        // Save the pixels at a lower level of the working tree.
+        copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]);
       }
 
       const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
@@ -589,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
       int cbf = cbf_is_set_any(cur_cu->cbf, depth);
 
-      if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
+      if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
         cur_cu->merged = 0;
         cur_cu->skipped = 1;
         // Selecting skip reduces bits needed to code the CU
@@ -615,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     }
 
     cost += mode_bits * state->lambda;
+
+    if (inter_zero_coeff_cost <= cost) {
+      cost = inter_zero_coeff_cost;
+
+      // Restore saved pixels from lower level of the working tree.
+      copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
+
+      if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+        cur_cu->merged = 0;
+        cur_cu->skipped = 1;
+        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+      }
+
+      if (cur_cu->tr_depth != depth) {
+        // Reset transform depth since there are no coefficients. This
+        // ensures that CBF is cleared for the whole area of the CU.
+        kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
+      }
+
+      cur_cu->cbf = 0;
+      lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
+    }
   }
 
   bool can_split_cu =
diff --git a/src/search_inter.c b/src/search_inter.c
index 306f89e1..7e659c3b 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -1495,18 +1495,18 @@ static void search_pu_inter(encoder_state_t * const state,
             continue;
           }
 
-          kvz_inter_recon_lcu_bipred(state,
-                                     state->frame->ref->images[
-                                       state->frame->ref_LX[0][merge_cand[i].ref[0]]
-                                     ],
-                                     state->frame->ref->images[
-                                       state->frame->ref_LX[1][merge_cand[j].ref[1]]
-                                     ],
-                                     x, y,
-                                     width,
-                                     height,
-                                     mv,
-                                     templcu);
+          kvz_inter_recon_bipred(state,
+                                 state->frame->ref->images[
+                                   state->frame->ref_LX[0][merge_cand[i].ref[0]]
+                                 ],
+                                 state->frame->ref->images[
+                                   state->frame->ref_LX[1][merge_cand[j].ref[1]]
+                                 ],
+                                 x, y,
+                                 width,
+                                 height,
+                                 mv,
+                                 templcu);
 
           for (int ypos = 0; ypos < height; ++ypos) {
             int dst_y = ypos * width;