Fix bug with OWF+FME+deblocking

Increases the MV safety margin of OWF from 2 to 3 when deblocking
is used and 4 when both deblocking and FME are used.

Fractional pixel motion estimation can move the vector one more pixel
down causing checksum error. This fixes that error by increasing the
OWF safety margin and changes the interface, so that different margin
can be used when FME or deblocking are not in use.
This commit is contained in:
Ari Koivula 2015-12-04 12:28:26 +02:00
parent f2d8cd4d64
commit c94707e6e8
2 changed files with 60 additions and 44 deletions

View file

@ -428,20 +428,15 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
* \returns * \returns
*/ */
unsigned kvz_image_calc_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y, unsigned kvz_image_calc_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y,
int block_width, int block_height, int max_lcu_below) { int block_width, int block_height, int max_px_below_lcu) {
assert(pic_x >= 0 && pic_x <= pic->width - block_width); assert(pic_x >= 0 && pic_x <= pic->width - block_width);
assert(pic_y >= 0 && pic_y <= pic->height - block_height); assert(pic_y >= 0 && pic_y <= pic->height - block_height);
// Check that we are not referencing pixels that are not final. // Check that we are not referencing pixels that are not final.
if (max_lcu_below >= 0) { if (max_px_below_lcu >= 0) {
// When SAO is off, row is considered reconstructed when the last LCU int next_lcu_row_px = ((pic_y >> LOG2_LCU_WIDTH) + 1) << LOG2_LCU_WIDTH;
// is done, although the bottom 2 pixels might still need deblocking. int px_below_lcu = ref_y + block_height - next_lcu_row_px;
// To work around this, add 2 luma pixels to the reach of the mv if (px_below_lcu > max_px_below_lcu) {
// in order to avoid referencing those possibly non-deblocked pixels.
int mv_lcu_row_reach = (ref_y + block_height - 1 + 2) / LCU_WIDTH;
int cur_lcu_row = pic_y / LCU_WIDTH;
if (mv_lcu_row_reach > cur_lcu_row + max_lcu_below) {
//printf("OOB %d %d -> %d\n", ref_y + block_height, pic_y, ref_y + block_height - pic_y);
return INT_MAX; return INT_MAX;
} }
} }

View file

@ -133,7 +133,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int
unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type, unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist, const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
int block_width, int max_lcu_below) int block_width, int max_px_below_lcu)
{ {
int n_points; int n_points;
int best_index = -1; int best_index = -1;
@ -259,7 +259,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -295,7 +295,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
const vector2d_t *orig, vector2d_t *mv, unsigned best_cost, const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
int block_width, int iSearchRange, int iRaster, int max_lcu_below) int block_width, int iSearchRange, int iRaster, int max_px_below_lcu)
{ {
int i; int i;
int k; int k;
@ -323,7 +323,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -375,7 +375,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
int iDist; int iDist;
int best_dist = 0; int best_dist = 0;
unsigned best_index = num_cand; unsigned best_index = num_cand;
int max_lcu_below = -1; int max_px_below_lcu = -1;
int(*calc_mvd)(const encoder_state_t * const, int, int, int, int(*calc_mvd)(const encoder_state_t * const, int, int, int,
int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -385,7 +385,15 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
} }
if (state->encoder_control->owf) { if (state->encoder_control->owf) {
max_lcu_below = 1; max_px_below_lcu = LCU_WIDTH;
if (state->encoder_control->fme_level > 0) {
// Fractional motion estimation can change the mv by at most 1 pixel.
max_px_below_lcu -= 1;
}
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
max_px_below_lcu -= 3;
}
} }
//step 1, compare (0,0) vector to predicted vectors //step 1, compare (0,0) vector to predicted vectors
@ -398,7 +406,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost); best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -423,7 +431,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -451,7 +459,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
for (iDist = 1; iDist <= iSearchRange; iDist *= 2) for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
{ {
best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist, best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below); mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);
} }
//step 3, raster scan //step 3, raster scan
@ -460,7 +468,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
best_dist = iRaster; best_dist = iRaster;
best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand,
num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_lcu_below); num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_px_below_lcu);
} }
//step 4 //step 4
@ -472,7 +480,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
while (iDist > 0) while (iDist > 0)
{ {
best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below); mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);
iDist = iDist >> 1; iDist = iDist >> 1;
} }
@ -484,7 +492,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
for (iDist = 1; iDist <= iSearchRange; iDist *= 2) for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
{ {
best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below); mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_px_below_lcu);
} }
} }
@ -552,7 +560,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
uint32_t best_bitcost = 0, bitcost; uint32_t best_bitcost = 0, bitcost;
unsigned i; unsigned i;
unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs. unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
int max_lcu_below = -1; int max_px_below_lcu = -1;
int (*calc_mvd)(const encoder_state_t * const, int, int, int, int (*calc_mvd)(const encoder_state_t * const, int, int, int,
int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -563,7 +571,15 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
if (state->encoder_control->owf) { if (state->encoder_control->owf) {
max_lcu_below = 1; max_px_below_lcu = LCU_WIDTH;
if (state->encoder_control->fme_level > 0) {
// Fractional motion estimation can change the mv by at most 1 pixel.
max_px_below_lcu -= 1;
}
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
max_px_below_lcu -= 3;
}
} }
// Check mv_in, if it's not in merge candidates. // Check mv_in, if it's not in merge candidates.
@ -583,7 +599,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
best_bitcost = bitcost; best_bitcost = bitcost;
best_index = num_cand; best_index = num_cand;
@ -607,7 +623,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -640,7 +656,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
@ -683,7 +699,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
@ -715,7 +731,7 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width, max_lcu_below); block_width, block_width, max_px_below_lcu);
cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
@ -1095,11 +1111,17 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
get_mvd_cost = kvz_get_mvd_coding_cost_cabac; get_mvd_cost = kvz_get_mvd_coding_cost_cabac;
} }
int max_px_below_lcu = -1;
int max_lcu_below = -1;
if (state->encoder_control->owf) { if (state->encoder_control->owf) {
max_lcu_below = 1; max_px_below_lcu = LCU_WIDTH;
if (state->encoder_control->fme_level > 0) {
// Fractional motion estimation can change the mv by at most 1 pixel.
max_px_below_lcu -= 1;
}
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
max_px_below_lcu -= 3;
}
} }
// Default to candidate 0 // Default to candidate 0
@ -1159,17 +1181,16 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
mv[1][1] = merge_cand[j].mv[1][1]; mv[1][1] = merge_cand[j].mv[1][1];
// Check boundaries when using owf to process multiple frames at the same time // Check boundaries when using owf to process multiple frames at the same time
if (max_lcu_below >= 0) { if (max_px_below_lcu >= 0) {
// When SAO is off, row is considered reconstructed when the last LCU // The following has been modified to fit testing two mv's, but it's
// is done, although the bottom 2 pixels might still need deblocking. // equivalent to: y + mv + block_height - next_lcu_row_px > max_px_below_lcu
// To work around this, add 2 luma pixels to the reach of the mv int next_lcu_row_px = ((y >> LOG2_LCU_WIDTH) + 1) << LOG2_LCU_WIDTH;
// in order to avoid referencing those possibly non-deblocked pixels. int block_height = LCU_WIDTH >> depth;
int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH; int max_mv = next_lcu_row_px - y - block_height + max_px_below_lcu;
int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH; int ceil_mv_l0 = ((mv[0][1] + 3) >> 2);
int cur_lcu_row = y / LCU_WIDTH; int ceil_mv_l1 = ((mv[1][1] + 3) >> 2);
if (mv_lcu_row_reach_1 > cur_lcu_row + max_lcu_below || mv_lcu_row_reach_2 > cur_lcu_row + max_lcu_below) {
continue; if (ceil_mv_l0 < max_mv || ceil_mv_l1 < max_mv) continue;
}
} }
kvz_inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu); kvz_inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu);