diff --git a/src/search_inter.c b/src/search_inter.c index 8e7adfd8..3e84d191 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -28,20 +28,21 @@ #include "rdo.h" - /** * \return True if referred block is within current tile. */ -static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width) +static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit) { - if (state->encoder_control->cfg->mv_constraint != KVZ_MV_CONSTRAIN_FRAME_AND_TILE) return true; + if (state->encoder_control->cfg->mv_constraint != KVZ_MV_CONSTRAIN_FRAME_AND_TILE) { + return (wpp_limit == -1 || y + (width << 2) <= (wpp_limit << 2)); + }; // TODO implement KVZ_MV_CONSTRAIN_FRAM and KVZ_MV_CONSTRAIN_TILE. - - vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y }; + const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y }; if (abs_mv.x >= 0 && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) && - abs_mv.y >= 0 && abs_mv.y + (width << 2) <= (state->tile->frame->height << 2)) + abs_mv.y >= 0 && abs_mv.y + (width << 2) <= (state->tile->frame->height << 2) && + (wpp_limit == -1 || y + (width << 2) <= (wpp_limit << 2))) { return true; } else { @@ -53,9 +54,9 @@ static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector /** * \return True if referred block is within current tile. */ -static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width) +static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit) { - return fracmv_within_tile(state, orig, x << 2, y << 2, width); + return fracmv_within_tile(state, orig, x << 2, y << 2, width, wpp_limit); } @@ -156,7 +157,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type, const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, - int width, int height, int max_px_below_lcu) + int width, int height, int wpp_limit) { int n_points; int best_index = -1; @@ -274,7 +275,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi for (i = 0; i < n_points; i++) { vector2d_t *current = &pattern[pattern_type][i]; - if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width)) { + if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, wpp_limit)) { continue; } @@ -286,7 +287,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -322,7 +323,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv, unsigned best_cost, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, - int width, int height, int iSearchRange, int iRaster, int max_px_below_lcu) + int width, int height, int iSearchRange, int iRaster, int wpp_limit) { int i; int k; @@ -342,7 +343,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic for (k = -iSearchRange; k <= iSearchRange; k += iRaster) { vector2d_t current = { k, i }; - if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width)) { + if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, wpp_limit)) { continue; } @@ -354,7 +355,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -405,7 +406,7 @@ static unsigned tz_search(const encoder_state_t * const state, int iDist; int best_dist = 0; unsigned best_index = num_cand; - int max_px_below_lcu = -1; + int wpp_limit = -1; int(*calc_mvd)(const encoder_state_t * const, int, int, int, int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], @@ -415,28 +416,28 @@ static unsigned tz_search(const encoder_state_t * const state, } if (state->encoder_control->owf) { - max_px_below_lcu = LCU_WIDTH; + wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH; if (state->encoder_control->fme_level > 0) { // Fractional motion estimation can change the mv by at most 1 pixel. - max_px_below_lcu -= 1; + wpp_limit -= 1; } if (state->encoder_control->deblock_enable) { // Strong deblock filter modifies 3 pixels. - max_px_below_lcu -= 3; + wpp_limit -= 3; } } //step 1, compare (0,0) vector to predicted vectors // Check whatever input vector we got, unless its (0, 0) which will be checked later. - if ((mv.x || mv.y) && intmv_within_tile(state, orig, mv.x, mv.y, width)) + if ((mv.x || mv.y) && intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) { PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + width, height, -1); best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -454,7 +455,7 @@ static unsigned tz_search(const encoder_state_t * const state, if (merge_cand[i].dir == 3) continue; mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; - if (!intmv_within_tile(state, orig, mv.x, mv.y, width)) { + if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) { continue; } @@ -464,7 +465,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -492,7 +493,7 @@ static unsigned tz_search(const encoder_state_t * const state, for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } //step 3, raster scan @@ -501,7 +502,7 @@ static unsigned tz_search(const encoder_state_t * const state, best_dist = iRaster; best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, - num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, max_px_below_lcu); + num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit); } //step 4 @@ -513,7 +514,7 @@ static unsigned tz_search(const encoder_state_t * const state, while (iDist > 0) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); iDist = iDist >> 1; } @@ -525,7 +526,7 @@ static unsigned tz_search(const encoder_state_t * const state, for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } } @@ -594,7 +595,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, uint32_t best_bitcost = 0, bitcost; unsigned i; unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs. - int max_px_below_lcu = -1; + int wpp_limit = -1; int (*calc_mvd)(const encoder_state_t * const, int, int, int, int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], @@ -603,16 +604,15 @@ static unsigned hexagon_search(const encoder_state_t * const state, calc_mvd = kvz_calc_mvd_cost_cabac; } - if (state->encoder_control->owf) { - max_px_below_lcu = LCU_WIDTH; + wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH; if (state->encoder_control->fme_level > 0) { // Fractional motion estimation can change the mv by at most 1 pixel. - max_px_below_lcu -= 1; + wpp_limit -= 1; } if (state->encoder_control->deblock_enable) { // Strong deblock filter modifies 3 pixels. - max_px_below_lcu -= 3; + wpp_limit -= 3; } } @@ -628,14 +628,14 @@ static unsigned hexagon_search(const encoder_state_t * const state, } if (!mv_in_merge_cand && - intmv_within_tile(state, orig, mv.x, mv.y, width)) + intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) { PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + width, height, -1); best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); best_bitcost = bitcost; best_index = num_cand; @@ -653,7 +653,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, if (merge_cand[i].dir == 3) continue; mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; - if (!intmv_within_tile(state, orig, mv.x, mv.y, width)) { + if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) { continue; } @@ -662,7 +662,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -689,7 +689,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, best_index = 0; for (i = 0; i < 7; ++i) { const vector2d_t *pattern = &large_hexbs[i]; - if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width)) { + if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) { continue; } @@ -699,7 +699,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, @@ -736,7 +736,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, // Iterate through the next 3 points. for (i = 0; i < 3; ++i) { const vector2d_t *offset = &large_hexbs[start + i]; - if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width)) { + if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) { continue; } @@ -746,7 +746,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, @@ -772,7 +772,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, // Do the final step of the search with a small pattern. for (i = 1; i < 5; ++i) { const vector2d_t *offset = &small_hexbs[i]; - if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width)) { + if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) { continue; } @@ -782,7 +782,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, @@ -906,6 +906,15 @@ static unsigned search_frac(const encoder_state_t * const state, { -1, -1 }, { 0, -1 }, { 1, -1 } }; + int wpp_limit = -1; + if (state->encoder_control->owf) { + wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH; + if (state->encoder_control->deblock_enable) { + // Strong deblock filter modifies 3 pixels. + wpp_limit -= 3; + } + } + //Set mv to halfpel precision vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; unsigned best_cost = UINT32_MAX; @@ -955,7 +964,7 @@ static unsigned search_frac(const encoder_state_t * const state, // Search halfpel positions around best integer mv for (i = 0; i < 9; ++i) { const vector2d_t *pattern = &square[i]; - if (!fracmv_within_tile(state, orig, (mv.x + pattern->x) << 1, (mv.y + pattern->y) << 1, width)) { + if (!fracmv_within_tile(state, orig, (mv.x + pattern->x) << 1, (mv.y + pattern->y) << 1, width, wpp_limit)) { continue; } @@ -995,7 +1004,7 @@ static unsigned search_frac(const encoder_state_t * const state, //Search quarterpel points around best halfpel mv for (i = 0; i < 9; ++i) { const vector2d_t *pattern = &square[i]; - if (!fracmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width)) { + if (!fracmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) { continue; } @@ -1132,8 +1141,8 @@ static void search_pu_inter_ref(const encoder_state_t * const state, break; } - assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width)); - + assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width, -1)); + if (state->encoder_control->cfg->fme_level > 0) { temp_cost = search_frac(state, width, height, @@ -1146,7 +1155,7 @@ static void search_pu_inter_ref(const encoder_state_t * const state, num_cand, ref_idx, &temp_bitcost); - assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width)); + assert(fracmv_within_tile(state, &orig, mv.x, mv.y, width, -1)); } merged = 0;