diff --git a/src/search.c b/src/search.c
index 37aa153b..ca77222f 100644
--- a/src/search.c
+++ b/src/search.c
@@ -524,7 +524,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         && WITHIN(depth, ctrl->pu_depth_inter.min, ctrl->pu_depth_inter.max);
 
     if (can_use_inter) {
-      int mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
+      double mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
       if (mode_cost < cost) {
         cost = mode_cost;
         cur_cu->type = CU_INTER;
diff --git a/src/search_inter.c b/src/search_inter.c
index 7a19b6ad..3388671b 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -1149,8 +1149,6 @@ static void search_pu_inter_ref(const encoder_state_t * const state,
       break;
   }
 
-  assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width, -1));
-  
   if (state->encoder_control->cfg->fme_level > 0) {
     temp_cost = search_frac(state,
                             width, height,
@@ -1163,7 +1161,6 @@ static void search_pu_inter_ref(const encoder_state_t * const state,
                             num_cand,
                             ref_idx,
                             &temp_bitcost);
-    assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width, -1));
   }
   
   merged = 0;
@@ -1290,7 +1287,7 @@ static int search_pu_inter(const encoder_state_t * const state,
   cur_cu->inter.mv_cand[0] = 0;
   cur_cu->inter.mv_cand[1] = 0;
 
-  cur_cu->inter.cost = UINT_MAX;
+  cur_cu->inter.cost = INT_MAX;
 
   uint32_t ref_idx;
   for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
@@ -1345,16 +1342,12 @@ static int search_pu_inter(const encoder_state_t * const state,
           mv[1][0] = merge_cand[j].mv[1][0];
           mv[1][1] = merge_cand[j].mv[1][1];
 
-          // Check boundaries when using owf to process multiple frames at the same time
-          if (max_px_below_lcu >= 0) {
-            // When SAO is off, row is considered reconstructed when the last LCU
-            // is done, although the bottom 2 pixels might still need deblocking.
-            // To work around this, add 2 luma pixels to the reach of the mv
-            // in order to avoid referencing those possibly non-deblocked pixels.
-            int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int cur_lcu_row = y / LCU_WIDTH;
-            if (mv_lcu_row_reach_1 > cur_lcu_row + max_px_below_lcu || mv_lcu_row_reach_2 > cur_lcu_row + max_px_below_lcu) {
+          {
+            // Don't try merge candidates that don't satisfy mv constraints.
+            vector2d_t orig = { x, y };
+            if (fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, -1) ||
+                fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, -1))
+            {
               continue;
             }
           }
@@ -1448,6 +1441,13 @@ static int search_pu_inter(const encoder_state_t * const state,
     FREE_POINTER(templcu);
   }
 
+  if (cur_cu->inter.cost < INT_MAX) {
+    const vector2d_t orig = { x, y };
+    if (cur_cu->inter.mv_dir == 1) {
+      assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, -1));
+    }
+  }
+
   return cur_cu->inter.cost;
 }