Merge branch 'mv-constraint'

2024-11-27 19:24:06 +00:00 · 2016-02-29 23:33:15 +02:00 · 2016-02-29 23:33:15 +02:00 · c02b0f4186
parent 1be877faf9 cfa722e448
commit c02b0f4186
7 changed files with 171 additions and 48 deletions
--- a/configure.ac
+++ b/configure.ac
@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=3
-ver_minor=2
+ver_minor=4
 ver_release=0

 # not used, but it prevents configure from adding a lot of defines to the CFLAGS
--- a/src/cfg.c
+++ b/src/cfg.c
@ -102,6 +102,8 @@ int kvz_config_init(kvz_config *cfg)
  cfg->add_encoder_info = true;
  cfg->calc_psnr = true;

+  cfg->mv_constraint = KVZ_MV_CONSTRAIN_NONE;
+
  return 1;
 }

@ -279,6 +281,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
                                                    "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", NULL };
  static const char * const colormatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m",
                                                    "smpte240m", "YCgCo", "bt2020nc", "bt2020c", NULL };
+  static const char * const mv_constraint_names[] = { "none", "frame", "tile", "frametile", "frametilemargin", NULL };

  static const char * const preset_values[11][28] = {
      { 
@ -533,6 +536,13 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
    cfg->fme_level = atoi(value);
  else if OPT("source-scan-type")
    return parse_enum(value, source_scan_type_names, &cfg->source_scan_type);
+  else if OPT("mv-constraint")
+  {
+    int8_t constraint = KVZ_MV_CONSTRAIN_NONE;
+    int result = parse_enum(value, mv_constraint_names, &constraint);
+    cfg->mv_constraint = constraint;
+    return result;
+  }
  else if OPT("sar")
    return sscanf(value, "%d:%d", &cfg->vui.sar_width, &cfg->vui.sar_height) == 2;
  else if OPT("overscan")
--- a/src/cli.c
+++ b/src/cli.c
@ -102,6 +102,7 @@ static const struct option long_options[] = {
  { "version",                  no_argument, NULL, 0 },
  { "help",                     no_argument, NULL, 0 },
  { "loop-input",               no_argument, NULL, 0 },
+  { "mv-constraint",      required_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -363,6 +364,10 @@ void print_help(void)
    "                                     fast, medium, slow, slower, veryslow, placebo\n"
    "          --no-psnr              : Don't calculate PSNR for frames\n"
    "          --loop-input           : Re-read input file forever\n"
+    "          --mv-constraint        : Constrain movement vectors\n"
+    "                                     \"none\": no constraint\n"
+    "                                     \"frametile\": constrain within the tile\n"
+    "                                     \"frametilemargin\": constrain even more\n"
    "\n"
    "  Video Usability Information:\n"
    "          --sar <width:height>   : Specify Sample Aspect Ratio\n"
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -577,10 +577,19 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
 #endif
          main_state->children[i].tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 1, job_description);
          if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].global->is_idr_frame) {
-            // Add dependancy to each child in the previous frame.
-            // TODO: Make it so that only adjacent tiles are dependet upon and search is constrained to those?
-            for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) {
-              kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done);
+#if 0
+            // Disabled due to non-determinism.
+            if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN)
+            {
+              // When MV's don't cross tile boundaries, add dependancy only to the same tile.
+              kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[i].previous_encoder_state->tqj_recon_done);
+            } else 
+#endif      
+            {
+              // Add dependancy to each child in the previous frame.
+              for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) {
+                kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done);
+              }
            }
          }
          kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@ -101,6 +101,19 @@ enum kvz_interlacing
  KVZ_INTERLACING_BFF = 2, // bottom field first
 };

+/**
+* \brief Constrain movement vectors.
+* \since 3.3.0
+*/
+enum kvz_mv_constraint
+{
+  KVZ_MV_CONSTRAIN_NONE = 0,
+  KVZ_MV_CONSTRAIN_FRAME = 1,  // Don't refer outside the frame.
+  KVZ_MV_CONSTRAIN_TILE = 2,  // Don't refer to other tiles.
+  KVZ_MV_CONSTRAIN_FRAME_AND_TILE = 3,  // Don't refer outside the tile.
+  KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN = 4,  // Keep enough margin for fractional pixel margins not to refer outside the tile.
+};
+
 /**
 * \brief GoP picture configuration.
 */
@ -204,6 +217,8 @@ typedef struct kvz_config

  int8_t mv_rdo;            /*!< \brief MV RDO calculation in search (0: estimation, 1: RDO). */
  int8_t calc_psnr;         /*!< \since 3.1.0 \brief Print PSNR in CLI. */
+
+  enum kvz_mv_constraint mv_constraint;  /*!< \since 3.3.0 \brief Constrain movement vectors. */
 } kvz_config;

 /**
--- a/src/search.c
+++ b/src/search.c
@ -524,7 +524,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        && WITHIN(depth, ctrl->pu_depth_inter.min, ctrl->pu_depth_inter.max);

    if (can_use_inter) {
-      int mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
+      double mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
      if (mode_cost < cost) {
        cost = mode_cost;
        cur_cu->type = CU_INTER;
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -28,6 +28,46 @@
 #include "rdo.h"


+/**
+ * \return  True if referred block is within current tile.
+ */
+static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit)
+{
+  if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_NONE) {
+    return (wpp_limit == -1 || y + (width << 2) <= (wpp_limit << 2));
+  };
+
+  int lt_margin = 0;
+  int rb_margin = 0;
+  if (KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
+    // Enforce a distance of 8 from any tile boundary.
+    lt_margin = 8;
+    rb_margin = 16;
+  }
+
+  // TODO implement KVZ_MV_CONSTRAIN_FRAM and KVZ_MV_CONSTRAIN_TILE.
+  const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y };
+
+  if (abs_mv.x >= lt_margin && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) - rb_margin &&
+      abs_mv.y >= lt_margin && abs_mv.y + (width << 2) <= (state->tile->frame->height << 2) - rb_margin &&
+      (wpp_limit == -1 || y + (width << 2) <= (wpp_limit << 2)))
+  {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+
+/**
+ * \return  True if referred block is within current tile.
+ */
+static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit)
+{
+  return fracmv_within_tile(state, orig, x << 2, y << 2, width, wpp_limit);
+}
+
+
 static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
 {
  int32_t num_bins = 0;
@ -125,7 +165,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int
 unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
                           int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                           int width, int height, int max_px_below_lcu)
+                           int width, int height, int wpp_limit)
 {
  int n_points;
  int best_index = -1;
@ -243,6 +283,10 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
  for (i = 0; i < n_points; i++)
  {
    vector2d_t *current = &pattern[pattern_type][i];
+    if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, wpp_limit)) {
+      continue;
+    }
+
    unsigned cost;
    uint32_t bitcost;

@ -251,7 +295,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
-                            width, height, max_px_below_lcu);
+                            width, height, -1);
      cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -287,7 +331,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
 unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                          int width, int height, int iSearchRange, int iRaster, int max_px_below_lcu)
+                          int width, int height, int iSearchRange, int iRaster, int wpp_limit)
 {
  int i;
  int k;
@ -307,6 +351,10 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
    for (k = -iSearchRange; k <= iSearchRange; k += iRaster)
    {
      vector2d_t current = { k, i };
+      if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, wpp_limit)) {
+        continue;
+      }
+
      unsigned cost;
      uint32_t bitcost;

@ -315,7 +363,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
        cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
-          width, height, max_px_below_lcu);
+          width, height, -1);
        cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

        PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -366,7 +414,7 @@ static unsigned tz_search(const encoder_state_t * const state,
  int iDist;
  int best_dist = 0;
  unsigned best_index = num_cand;
-  int max_px_below_lcu = -1;
+  int wpp_limit = -1;

  int(*calc_mvd)(const encoder_state_t * const, int, int, int,
    int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -376,28 +424,28 @@ static unsigned tz_search(const encoder_state_t * const state,
  }

  if (state->encoder_control->owf) {
-    max_px_below_lcu = LCU_WIDTH;
+    wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
    if (state->encoder_control->fme_level > 0) {
      // Fractional motion estimation can change the mv by at most 1 pixel.
-      max_px_below_lcu -= 1;
+      wpp_limit -= 1;
    }
    if (state->encoder_control->deblock_enable) {
      // Strong deblock filter modifies 3 pixels.
-      max_px_below_lcu -= 3;
+      wpp_limit -= 3;
    }
  }

  //step 1, compare (0,0) vector to predicted vectors
  
  // Check whatever input vector we got, unless its (0, 0) which will be checked later.
-  if (mv.x || mv.y) 
+  if ((mv.x || mv.y) && intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit))
  {
    PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);

    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        width, height, max_px_below_lcu);
+                                        width, height, -1);
    best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -415,6 +463,9 @@ static unsigned tz_search(const encoder_state_t * const state,
    if (merge_cand[i].dir == 3) continue;
    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
+    if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) {
+      continue;
+    }

    PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);

@ -422,7 +473,7 @@ static unsigned tz_search(const encoder_state_t * const state,
    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   width, height, max_px_below_lcu);
+                                   width, height, -1);
    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -450,7 +501,7 @@ static unsigned tz_search(const encoder_state_t * const state,
  for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
  {
    best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
-                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
+                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
  }

  //step 3, raster scan
@ -459,7 +510,7 @@ static unsigned tz_search(const encoder_state_t * const state,
    best_dist = iRaster;

    best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand,
-                                 num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, max_px_below_lcu);
+                                 num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit);
  }

  //step 4
@ -471,7 +522,7 @@ static unsigned tz_search(const encoder_state_t * const state,
    while (iDist > 0)
    {
      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);

      iDist = iDist >> 1;
    }
@ -483,7 +534,7 @@ static unsigned tz_search(const encoder_state_t * const state,
    for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
    {
      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
    }
  }

@ -552,7 +603,7 @@ static unsigned hexagon_search(const encoder_state_t * const state,
  uint32_t best_bitcost = 0, bitcost;
  unsigned i;
  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-  int max_px_below_lcu = -1;
+  int wpp_limit = -1;

  int (*calc_mvd)(const encoder_state_t * const, int, int, int,
    int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -561,16 +612,15 @@ static unsigned hexagon_search(const encoder_state_t * const state,
    calc_mvd = kvz_calc_mvd_cost_cabac;
  }

-  
  if (state->encoder_control->owf) {
-    max_px_below_lcu = LCU_WIDTH;
+    wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
    if (state->encoder_control->fme_level > 0) {
      // Fractional motion estimation can change the mv by at most 1 pixel.
-      max_px_below_lcu -= 1;
+      wpp_limit -= 1;
    }
    if (state->encoder_control->deblock_enable) {
      // Strong deblock filter modifies 3 pixels.
-      max_px_below_lcu -= 3;
+      wpp_limit -= 3;
    }
  }

@ -585,13 +635,15 @@ static unsigned hexagon_search(const encoder_state_t * const state,
    }
  }

-  if (!mv_in_merge_cand) {
+  if (!mv_in_merge_cand &&
+    intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit))
+  {
    PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);

    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        width, height, max_px_below_lcu);
+                                        width, height, -1);
    best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
    best_bitcost = bitcost;
    best_index = num_cand; 
@ -609,13 +661,16 @@ static unsigned hexagon_search(const encoder_state_t * const state,
    if (merge_cand[i].dir == 3) continue;
    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
+    if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) {
+      continue;
+    }

    PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);

    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   width, height, max_px_below_lcu);
+                                   width, height, -1);
    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -642,13 +697,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
  best_index = 0;
  for (i = 0; i < 7; ++i) {
    const vector2d_t *pattern = &large_hexbs[i];
+    if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) {
+      continue;
+    }
+
    unsigned cost;
    {
      PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
-                             width, height, max_px_below_lcu);
+                             width, height, -1);
      cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -685,13 +744,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
    // Iterate through the next 3 points.
    for (i = 0; i < 3; ++i) {
      const vector2d_t *offset = &large_hexbs[start + i];
+      if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) {
+        continue;
+      }
+
      unsigned cost;
      {
        PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
        cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                               (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                               (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                               width, height, max_px_below_lcu);
+                               width, height, -1);
        cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
        PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
@ -717,13 +780,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
  // Do the final step of the search with a small pattern.
  for (i = 1; i < 5; ++i) {
    const vector2d_t *offset = &small_hexbs[i];
+    if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) {
+      continue;
+    }
+
    unsigned cost;
    {
      PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                             width, height, max_px_below_lcu);
+                             width, height, -1);
      cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
@ -847,12 +914,21 @@ static unsigned search_frac(const encoder_state_t * const state,
      { -1, -1 }, { 0, -1 }, { 1, -1 }
  };

+  int wpp_limit = -1;
+  if (state->encoder_control->owf) {
+    wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
+    if (state->encoder_control->deblock_enable) {
+      // Strong deblock filter modifies 3 pixels.
+      wpp_limit -= 3;
+    }
+  }
+
  //Set mv to halfpel precision
  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
  unsigned best_cost = UINT32_MAX;
  uint32_t best_bitcost = 0, bitcost;
  unsigned i;
-  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
+  unsigned best_index = 4;

  unsigned cost = 0;

@ -896,6 +972,9 @@ static unsigned search_frac(const encoder_state_t * const state,
  // Search halfpel positions around best integer mv
  for (i = 0; i < 9; ++i) {
    const vector2d_t *pattern = &square[i];
+    if (!fracmv_within_tile(state, orig, (mv.x + pattern->x) << 1, (mv.y + pattern->y) << 1, width, wpp_limit)) {
+      continue;
+    }

    int y,x;
    for(y = 0; y < height; ++y) {
@ -911,7 +990,6 @@ static unsigned search_frac(const encoder_state_t * const state,
                             tmp_filtered, width);

    cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
    if (cost < best_cost) {
      best_cost    = cost;
      best_index   = i;
@ -920,12 +998,12 @@ static unsigned search_frac(const encoder_state_t * const state,
    }
  }

-  //Set mv to best match
+  // Move search to best_index
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;
-
  halfpel_offset.x = square[best_index].x*2;
  halfpel_offset.y = square[best_index].y*2;
+  best_index = 4;

  //Set mv to quarterpel precision
  mv.x <<= 1;
@ -934,6 +1012,9 @@ static unsigned search_frac(const encoder_state_t * const state,
  //Search quarterpel points around best halfpel mv
  for (i = 0; i < 9; ++i) {
    const vector2d_t *pattern = &square[i];
+    if (!fracmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) {
+      continue;
+    }

    int y,x;
    for(y = 0; y < height; ++y) {
@ -1081,7 +1162,7 @@ static void search_pu_inter_ref(const encoder_state_t * const state,
                            ref_idx,
                            &temp_bitcost);
  }
-
+  
  merged = 0;
  // Check every candidate to find a match
  for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
@ -1206,7 +1287,7 @@ static int search_pu_inter(const encoder_state_t * const state,
  cur_cu->inter.mv_cand[0] = 0;
  cur_cu->inter.mv_cand[1] = 0;

-  cur_cu->inter.cost = UINT_MAX;
+  cur_cu->inter.cost = INT_MAX;

  uint32_t ref_idx;
  for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
@ -1261,16 +1342,12 @@ static int search_pu_inter(const encoder_state_t * const state,
          mv[1][0] = merge_cand[j].mv[1][0];
          mv[1][1] = merge_cand[j].mv[1][1];

-          // Check boundaries when using owf to process multiple frames at the same time
-          if (max_px_below_lcu >= 0) {
-            // When SAO is off, row is considered reconstructed when the last LCU
-            // is done, although the bottom 2 pixels might still need deblocking.
-            // To work around this, add 2 luma pixels to the reach of the mv
-            // in order to avoid referencing those possibly non-deblocked pixels.
-            int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int cur_lcu_row = y / LCU_WIDTH;
-            if (mv_lcu_row_reach_1 > cur_lcu_row + max_px_below_lcu || mv_lcu_row_reach_2 > cur_lcu_row + max_px_below_lcu) {
+          {
+            // Don't try merge candidates that don't satisfy mv constraints.
+            vector2d_t orig = { x, y };
+            if (fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, -1) ||
+                fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, -1))
+            {
              continue;
            }
          }
@ -1364,6 +1441,13 @@ static int search_pu_inter(const encoder_state_t * const state,
    FREE_POINTER(templcu);
  }

+  if (cur_cu->inter.cost < INT_MAX) {
+    const vector2d_t orig = { x, y };
+    if (cur_cu->inter.mv_dir == 1) {
+      assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, -1));
+    }
+  }
+
  return cur_cu->inter.cost;
 }