Fix bug with OWF+FME+deblocking

Increases the MV safety margin of OWF from 2 to 3 when deblocking is used and 4 when both deblocking and FME are used. Fractional pixel motion estimation can move the vector one more pixel down causing checksum error. This fixes that error by increasing the OWF safety margin and changes the interface, so that different margin can be used when FME or deblocking are not in use.
2024-11-27 19:24:06 +00:00 · 2015-12-04 12:28:26 +02:00 · 2015-12-04 12:28:26 +02:00 · c94707e6e8
parent f2d8cd4d64
commit c94707e6e8
2 changed files with 60 additions and 44 deletions
--- a/src/image.c
+++ b/src/image.c
@ -428,20 +428,15 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
 * \returns  
 */
 unsigned kvz_image_calc_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y,
-                        int block_width, int block_height, int max_lcu_below) {
+                        int block_width, int block_height, int max_px_below_lcu) {
  assert(pic_x >= 0 && pic_x <= pic->width - block_width);
  assert(pic_y >= 0 && pic_y <= pic->height - block_height);
  
  // Check that we are not referencing pixels that are not final.
-  if (max_lcu_below >= 0) {
-    // When SAO is off, row is considered reconstructed when the last LCU
-    // is done, although the bottom 2 pixels might still need deblocking.
-    // To work around this, add 2 luma pixels to the reach of the mv
-    // in order to avoid referencing those possibly non-deblocked pixels.
-    int mv_lcu_row_reach = (ref_y + block_height - 1 + 2) / LCU_WIDTH;
-    int cur_lcu_row = pic_y / LCU_WIDTH;
-    if (mv_lcu_row_reach > cur_lcu_row + max_lcu_below) {
-      //printf("OOB %d %d -> %d\n", ref_y + block_height, pic_y, ref_y + block_height - pic_y);
+  if (max_px_below_lcu >= 0) {
+    int next_lcu_row_px = ((pic_y >> LOG2_LCU_WIDTH) + 1) << LOG2_LCU_WIDTH;
+    int px_below_lcu = ref_y + block_height - next_lcu_row_px;
+    if (px_below_lcu > max_px_below_lcu) {
      return INT_MAX;
    }
  }
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -133,7 +133,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int
 unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
                           int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                           int block_width, int max_lcu_below)
+                           int block_width, int max_px_below_lcu)
 {
  int n_points;
  int best_index = -1;
@ -259,7 +259,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
-                            block_width, block_width, max_lcu_below);
+                            block_width, block_width, max_px_below_lcu);
      cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -295,7 +295,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
 unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                          int block_width, int iSearchRange, int iRaster, int max_lcu_below)
+                          int block_width, int iSearchRange, int iRaster, int max_px_below_lcu)
 {
  int i;
  int k;
@ -323,7 +323,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
        cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
-          block_width, block_width, max_lcu_below);
+          block_width, block_width, max_px_below_lcu);
        cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

        PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -375,7 +375,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
  int iDist;
  int best_dist = 0;
  unsigned best_index = num_cand;
-  int max_lcu_below = -1;
+  int max_px_below_lcu = -1;

  int(*calc_mvd)(const encoder_state_t * const, int, int, int,
    int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -385,7 +385,15 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
  }

  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
+    max_px_below_lcu = LCU_WIDTH;
+    if (state->encoder_control->fme_level > 0) {
+      // Fractional motion estimation can change the mv by at most 1 pixel.
+      max_px_below_lcu -= 1;
+    }
+    if (state->encoder_control->deblock_enable) {
+      // Strong deblock filter modifies 3 pixels.
+      max_px_below_lcu -= 3;
+    }
  }

  //step 1, compare (0,0) vector to predicted vectors
@ -398,7 +406,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        block_width, block_width, max_lcu_below);
+                                        block_width, block_width, max_px_below_lcu);
    best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -423,7 +431,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   block_width, block_width, max_lcu_below);
+                                   block_width, block_width, max_px_below_lcu);
    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -451,7 +459,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
  for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
  {
    best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
-                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);
  }

  //step 3, raster scan
@ -460,7 +468,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
    best_dist = iRaster;

    best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, 
-                                 num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_lcu_below);
+                                 num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_px_below_lcu);
  }

  //step 4
@ -472,7 +480,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
    while (iDist > 0)
    {
      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);

      iDist = iDist >> 1;
    }
@ -484,7 +492,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
    for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
    {
      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);
    }
  }

@ -552,7 +560,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
  uint32_t best_bitcost = 0, bitcost;
  unsigned i;
  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-  int max_lcu_below = -1;
+  int max_px_below_lcu = -1;

  int (*calc_mvd)(const encoder_state_t * const, int, int, int,
    int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -563,7 +571,15 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep

  
  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
+    max_px_below_lcu = LCU_WIDTH;
+    if (state->encoder_control->fme_level > 0) {
+      // Fractional motion estimation can change the mv by at most 1 pixel.
+      max_px_below_lcu -= 1;
+    }
+    if (state->encoder_control->deblock_enable) {
+      // Strong deblock filter modifies 3 pixels.
+      max_px_below_lcu -= 3;
+    }
  }

  // Check mv_in, if it's not in merge candidates.
@ -583,7 +599,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        block_width, block_width, max_lcu_below);
+                                        block_width, block_width, max_px_below_lcu);
    best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
    best_bitcost = bitcost;
    best_index = num_cand; 
@ -607,7 +623,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   block_width, block_width, max_lcu_below);
+                                   block_width, block_width, max_px_below_lcu);
    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

    PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -640,7 +656,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
-                             block_width, block_width, max_lcu_below);
+                             block_width, block_width, max_px_below_lcu);
      cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);

      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -683,7 +699,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
        cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                               (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                               (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                               block_width, block_width, max_lcu_below);
+                               block_width, block_width, max_px_below_lcu);
        cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
        PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
@ -715,7 +731,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                             block_width, block_width, max_lcu_below);
+                             block_width, block_width, max_px_below_lcu);
      cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
      PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
@ -1095,11 +1111,17 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
    get_mvd_cost = kvz_get_mvd_coding_cost_cabac;
  }

-
-  int max_lcu_below = -1;
-  
+  int max_px_below_lcu = -1;
  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
+    max_px_below_lcu = LCU_WIDTH;
+    if (state->encoder_control->fme_level > 0) {
+      // Fractional motion estimation can change the mv by at most 1 pixel.
+      max_px_below_lcu -= 1;
+    }
+    if (state->encoder_control->deblock_enable) {
+      // Strong deblock filter modifies 3 pixels.
+      max_px_below_lcu -= 3;
+    }
  }

  // Default to candidate 0
@ -1159,17 +1181,16 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
          mv[1][1] = merge_cand[j].mv[1][1];

          // Check boundaries when using owf to process multiple frames at the same time
-          if (max_lcu_below >= 0) {
-            // When SAO is off, row is considered reconstructed when the last LCU
-            // is done, although the bottom 2 pixels might still need deblocking.
-            // To work around this, add 2 luma pixels to the reach of the mv
-            // in order to avoid referencing those possibly non-deblocked pixels.
-            int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int cur_lcu_row = y / LCU_WIDTH;
-            if (mv_lcu_row_reach_1 > cur_lcu_row + max_lcu_below || mv_lcu_row_reach_2 > cur_lcu_row + max_lcu_below) {
-              continue;
-            }
+          if (max_px_below_lcu >= 0) {
+            // The following has been modified to fit testing two mv's, but it's
+            // equivalent to: y + mv + block_height - next_lcu_row_px > max_px_below_lcu
+            int next_lcu_row_px = ((y >> LOG2_LCU_WIDTH) + 1) << LOG2_LCU_WIDTH;
+            int block_height = LCU_WIDTH >> depth;
+            int max_mv = next_lcu_row_px - y - block_height + max_px_below_lcu;
+            int ceil_mv_l0 = ((mv[0][1] + 3) >> 2);
+            int ceil_mv_l1 = ((mv[1][1] + 3) >> 2);
+
+            if (ceil_mv_l0 < max_mv || ceil_mv_l1 < max_mv) continue;
          }

          kvz_inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu);