/***************************************************************************** * This file is part of Kvazaar HEVC encoder. * * Copyright (C) 2013-2015 Tampere University of Technology and others (see * COPYING file). * * Kvazaar is free software: you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the * Free Software Foundation; either version 2.1 of the License, or (at your * option) any later version. * * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with Kvazaar. If not, see . ****************************************************************************/ #include "search_inter.h" #include #include #include "cabac.h" #include "encoder.h" #include "image.h" #include "imagelist.h" #include "inter.h" #include "kvazaar.h" #include "rdo.h" #include "strategies/strategies-ipol.h" #include "strategies/strategies-picture.h" #include "videoframe.h" /** * \return True if referred block is within current tile. */ static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit) { if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_NONE) { return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2)); }; int margin = 0; if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) { // Enforce a distance of 8 from any tile boundary. margin = 4 * 4; } // TODO implement KVZ_MV_CONSTRAIN_FRAM and KVZ_MV_CONSTRAIN_TILE. const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y }; // Check that both margin and wpp_limit constraints are satisfied. if (abs_mv.x >= margin && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) - margin && abs_mv.y >= margin && abs_mv.y + (height << 2) <= (state->tile->frame->height << 2) - margin && (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2))) { return true; } else { return false; } } static INLINE int get_wpp_limit(const encoder_state_t *state, const vector2d_t* orig) { const encoder_control_t *ctrl = state->encoder_control; if (ctrl->owf && ctrl->wpp) { // Limit motion vectors to the LCU-row below this row. // To avoid fractional pixel interpolation depending on things outside // this range, add a margin of 4 pixels. // - fme needs 4 pixels // - odd chroma interpolation needs 4 pixels int wpp_limit = 2 * LCU_WIDTH - 4 - orig->y % LCU_WIDTH; if (ctrl->deblock_enable && !ctrl->sao_enable) { // As a special case, when deblocking is enabled but SAO is not, we have // to avoid the possibility of interpolation filters reaching the // non-deblocked pixels. The deblocking for the horizontal edge on the // LCU boundary can reach 4 pixels. If SAO is enabled, this WPP-row // depends on the SAO job, which depends on the deblocking having // already been done. wpp_limit -= 4; } return wpp_limit; } else { return -1; } } /** * \return True if referred block is within current tile. */ static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit) { return fracmv_within_tile(state, orig, x << 2, y << 2, width, height, wpp_limit); } static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) { // Calculate 2 * log2(symbol + 2) unsigned bins = 0; symbol += 2; if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } if (symbol >= 1 << 1) { bins += 2; } // TODO: It might be a good idea to put a small slope on this function to // make sure any search function that follows the gradient heads towards // a smaller MVD, but that would require fractinal costs and bits being // used everywhere in inter search. // return num_bins + 0.001 * symbol; return bins; } /**Checks if mv is one of the merge candidates * \return true if found else return false */ static bool mv_in_merge(const inter_merge_cand_t* merge_cand, int16_t num_cand, const vector2d_t* mv) { for (int i = 0; i < num_cand; ++i) { if (merge_cand[i].dir == 3) continue; const vector2d_t merge_mv = { merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2, merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 }; if (merge_mv.x == mv->x && merge_mv.y == mv->y) { return true; } } return false; } static unsigned select_starting_point(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state, const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref, int16_t mv_cand[2][2], int32_t ref_idx, unsigned best_cost, unsigned *best_index, uint32_t *best_bitcost, kvz_mvd_cost_func *calc_mvd){ // Go through candidates for (unsigned i = 0; i < num_cand; ++i) { if (merge_cand[i].dir == 3) continue; mv->x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; mv->y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; if (mv->x == 0 && mv->y == 0) continue; if (!intmv_within_tile(state, orig, mv->x, mv->y, width, height, wpp_limit)) { continue; } uint32_t bitcost = 0; unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y, width, height, -1); cost += calc_mvd(state, mv->x, mv->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; *best_index = i; *best_bitcost = bitcost; } } if (*best_index < num_cand) { mv->x = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][0] >> 2; mv->y = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][1] >> 2; } else if (*best_index == num_cand) { mv->x = mv_in_out->x >> 2; mv->y = mv_in_out->y >> 2; } else { mv->x = 0; mv->y = 0; } return best_cost; } static uint32_t get_mvd_coding_cost(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac) { unsigned bitcost = 0; const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) }; bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0); if (abs_mvd.x > 0) { bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1); if (abs_mvd.x > 1) { bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x - 2) << CTX_FRAC_BITS; } bitcost += CTX_FRAC_ONE_BIT; // sign } bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.y > 0); if (abs_mvd.y > 0) { bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.y > 1); if (abs_mvd.y > 1) { bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y - 2) << CTX_FRAC_BITS; } bitcost += CTX_FRAC_ONE_BIT; // sign } // Round and shift back to integer bits. return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; } static int calc_mvd_cost(encoder_state_t * const state, int x, int y, int mv_shift, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand,int32_t ref_idx, uint32_t *bitcost) { uint32_t temp_bitcost = 0; uint32_t merge_idx; int cand1_cost,cand2_cost; vector2d_t mvd_temp1, mvd_temp2; int8_t merged = 0; int8_t cur_mv_cand = 0; x <<= mv_shift; y <<= mv_shift; // Check every candidate to find a match for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { if (merge_cand[merge_idx].dir == 3) continue; if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x && merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y && merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) { temp_bitcost += merge_idx; merged = 1; break; } } // Check mvd cost only if mv is not merged if(!merged) { mvd_temp1.x = x - mv_cand[0][0]; mvd_temp1.y = y - mv_cand[0][1]; cand1_cost = get_mvd_coding_cost(state, &mvd_temp1, &state->cabac); mvd_temp2.x = x - mv_cand[1][0]; mvd_temp2.y = y - mv_cand[1][1]; cand2_cost = get_mvd_coding_cost(state, &mvd_temp2, &state->cabac); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cur_mv_cand = 1; } temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost; } *bitcost = temp_bitcost; return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); } static bool early_terminate(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state, const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref, int16_t mv_cand[2][2], int32_t ref_idx, unsigned *best_cost, uint32_t *bitcost_out, uint32_t *best_bitcost, kvz_mvd_cost_func *calc_mvd) { static const vector2d_t small_hexbs[5] = { { 0, 0 }, { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }, }; double multiplier = 1; // If early termination is set to fast set multiplier to 0.9 if (state->encoder_control->cfg->me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE){ multiplier = 0.95; } const vector2d_t *offset; for (int k = 0; k < 2; ++k){ unsigned best_index = 0; for (int i = 1; i < 5; ++i) { offset = &small_hexbs[i]; if (!intmv_within_tile(state, orig, mv->x + offset->x, mv->y + offset->y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + offset->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + offset->y, width, height, -1); unsigned bitcost; cost += calc_mvd(state, mv->x + offset->x, mv->y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < multiplier * *best_cost ) { *best_cost = cost; best_index = i; *best_bitcost = bitcost; } } // Adjust the movement vector mv->x += small_hexbs[best_index].x; mv->y += small_hexbs[best_index].y; // if best match is at center we stop the search if (best_index == 0){ // Return final movement vector in quarter-pixel precision. mv_in_out->x = mv->x << 2; mv_in_out->y = mv->y << 2; *bitcost_out = *best_bitcost; return true; } } return false; } unsigned kvz_tz_pattern_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type, const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, int width, int height, int wpp_limit) { int n_points; int best_index = -1; int i; vector2d_t mv_best = { 0, 0 }; kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } assert(pattern_type < 4); //implemented search patterns vector2d_t pattern[4][8] = { //diamond (8 points) //[ ][ ][ ][ ][1][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][8][ ][ ][ ][5][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[4][ ][ ][ ][o][ ][ ][ ][2] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][7][ ][ ][ ][6][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][3][ ][ ][ ][ ] { { 0, iDist }, { iDist, 0 }, { 0, -iDist }, { -iDist, 0 }, { iDist / 2, iDist / 2 }, { iDist / 2, -iDist / 2 }, { -iDist / 2, -iDist / 2 }, { -iDist / 2, iDist / 2 } }, //square (8 points) //[8][ ][ ][ ][1][ ][ ][ ][2] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[7][ ][ ][ ][o][ ][ ][ ][3] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[6][ ][ ][ ][5][ ][ ][ ][4] { { 0, iDist }, { iDist, iDist }, { iDist, 0 }, { iDist, -iDist }, { 0, -iDist }, { -iDist, -iDist }, { -iDist, 0 }, { -iDist, iDist } }, //octagon (8 points) //[ ][ ][5][ ][ ][ ][1][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][2] //[4][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][o][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[8][ ][ ][ ][ ][ ][ ][ ][6] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][7][ ][ ][ ][3][ ][ ] { { iDist / 2, iDist }, { iDist, iDist / 2 }, { iDist / 2, -iDist }, { -iDist, iDist / 2 }, { -iDist / 2, iDist }, { iDist, -iDist / 2 }, { -iDist / 2, -iDist }, { -iDist, -iDist / 2 } }, //hexagon (6 points) //[ ][ ][5][ ][ ][ ][1][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[4][ ][ ][ ][o][ ][ ][ ][2] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][ ][ ][ ][ ][ ][ ][ ] //[ ][ ][6][ ][ ][ ][3][ ][ ] { { iDist / 2, iDist }, { iDist, 0 }, { iDist / 2, -iDist }, { -iDist, 0 }, { iDist / 2, iDist }, { -iDist / 2, -iDist }, { 0, 0 }, { 0, 0 } } }; //set the number of points to be checked if (iDist == 1) { switch (pattern_type) { case 0: n_points = 4; break; case 2: n_points = 4; break; case 3: n_points = 4; break; default: n_points = 8; break; }; } else { switch (pattern_type) { case 3: n_points = 6; break; default: n_points = 8; break; }; } //compute SAD values for all chosen points for (i = 0; i < n_points; i++) { vector2d_t *current = &pattern[pattern_type][i]; if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, height, wpp_limit)) { continue; } unsigned cost; uint32_t bitcost; { cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y, width, height, -1); cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); } if (cost < best_cost) { best_cost = cost; *best_bitcost = bitcost; best_index = i; } } if (best_index >= 0) { mv_best = pattern[pattern_type][best_index]; *best_dist = iDist; } mv->x += mv_best.x; mv->y += mv_best.y; return best_cost; } unsigned kvz_tz_raster_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv, unsigned best_cost, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, int width, int height, int iSearchRange, int iRaster, int wpp_limit) { int i; int k; vector2d_t mv_best = { 0, 0 }; kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } //compute SAD values for every point in the iRaster downsampled version of the current search area for (i = iSearchRange; i >= -iSearchRange; i -= iRaster) { for (k = -iSearchRange; k <= iSearchRange; k += iRaster) { vector2d_t current = { k, i }; if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, height, wpp_limit)) { continue; } unsigned cost; uint32_t bitcost; { cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i, width, height, -1); cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); } if (cost < best_cost) { best_cost = cost; *best_bitcost = bitcost; mv_best = current; } } } mv->x += mv_best.x; mv->y += mv_best.y; return best_cost; } static unsigned tz_search(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) { //TZ parameters const int iSearchRange = 96; // search range for each stage const int iRaster = 5; // search distance limit and downsampling factor for step 3 const unsigned step2_type = 0; // search patterns for steps 2 and 4 const unsigned step4_type = 0; const bool bRasterRefinementEnable = true; // enable step 4 mode 1 const bool bStarRefinementEnable = false; // enable step 4 mode 2 (only one mode will be executed) vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; unsigned best_cost = UINT32_MAX; uint32_t best_bitcost = 0; int iDist; int best_dist = 0; unsigned best_index = num_cand + 1; int wpp_limit = get_wpp_limit(state, orig); kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, width, height, -1); best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost); best_index = num_cand + 1; } // Check mv_in if it's not one of the merge candidates. if (!mv_in_merge(merge_cand, num_cand, &mv) && intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) { unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, width, height, -1); unsigned bitcost; cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_index = num_cand; best_bitcost = bitcost; } } // Select starting point from among merge candidates. These should include // both mv_cand vectors and (0, 0). best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd); // Check if we should stop search if (state->encoder_control->cfg->me_early_termination){ if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost; } //step 2, grid search for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } //step 3, raster scan if (best_dist > iRaster) { best_dist = iRaster; best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit); } //step 4 //raster refinement if (bRasterRefinementEnable && best_dist > 0) { iDist = best_dist >> 1; while (iDist > 0) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); iDist = iDist >> 1; } } //star refinement (repeat step 2 for the current starting point) if (bStarRefinementEnable && best_dist > 0) { for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } } mv.x = mv.x << 2; mv.y = mv.y << 2; *mv_in_out = mv; *bitcost_out = best_bitcost; return best_cost; } /** * \brief Do motion search using the HEXBS algorithm. * * \param width width of the block to search * \param height height of the block to search * \param pic Picture motion vector is searched for. * \param ref Picture motion vector is searched from. * \param orig Top left corner of the searched for block. * \param mv_in_out Predicted mv in and best out. Quarter pixel precision. * * \returns Cost of the motion vector. * * Motion vector is searched by first searching iteratively with the large * hexagon pattern until the best match is at the center of the hexagon. * As a final step a smaller hexagon is used to check the adjacent pixels. * * If a non 0,0 predicted motion vector predictor is given as mv_in_out, * the 0,0 vector is also tried. This is hoped to help in the case where * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ static unsigned hexagon_search(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new // points to search. // 6--1,7 // / \ =) // 5 0 2,8 // \ / // 4---3 static const vector2d_t large_hexbs[9] = { { 0, 0 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }, { -1, -2 }, { 1, -2 }, { 2, 0 } }; // This is used as the last step of the hexagon search. // 1 // 2 0 3 // 4 static const vector2d_t small_hexbs[5] = { { 0, 0 }, { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 } }; vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; unsigned best_cost = UINT32_MAX; uint32_t best_bitcost = 0, bitcost; unsigned i; // Current best index, either to merge_cands, large_hebx or small_hexbs. unsigned best_index = num_cand + 1; int wpp_limit = get_wpp_limit(state, orig); kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, width, height, -1); best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); best_bitcost = bitcost; best_index = num_cand + 1; } // Check mv_in if it's not one of the merge candidates. if (!mv_in_merge(merge_cand, num_cand, &mv) && intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) { unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, width, height, -1); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_index = num_cand; best_bitcost = bitcost; } } // Select starting point from among merge candidates. These should include // both mv_cand vectors and (0, 0). best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd); // Check if we should stop search if (state->encoder_control->cfg->me_early_termination){ if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost; } // Search the initial 7 points of the hexagon. best_index = 0; for (i = 0; i < 7; ++i) { const vector2d_t *pattern = &large_hexbs[i]; if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, width, height, -1); cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_index = i; best_bitcost = bitcost; } } // Iteratively search the 3 new points around the best match, until the best // match is in the center. while (best_index != 0) { unsigned start; // Starting point of the 3 offsets to be searched. if (best_index == 1) { start = 6; } else if (best_index == 8) { start = 1; } else { start = best_index - 1; } // Move the center to the best match. mv.x += large_hexbs[best_index].x; mv.y += large_hexbs[best_index].y; best_index = 0; // Iterate through the next 3 points. for (i = 0; i < 3; ++i) { const vector2d_t *offset = &large_hexbs[start + i]; if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, width, height, -1); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_index = start + i; best_bitcost = bitcost; } } } // Move the center to the best match. mv.x += large_hexbs[best_index].x; mv.y += large_hexbs[best_index].y; best_index = 0; // Do the final step of the search with a small pattern. for (i = 1; i < 5; ++i) { const vector2d_t *offset = &small_hexbs[i]; if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, width, height, -1); cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost > 0 && cost < best_cost) { best_cost = cost; best_index = i; best_bitcost = bitcost; } } // Adjust the movement vector according to the final best match. mv.x += small_hexbs[best_index].x; mv.y += small_hexbs[best_index].y; // Return final movement vector in quarter-pixel precision. mv_in_out->x = mv.x << 2; mv_in_out->y = mv.y << 2; *bitcost_out = best_bitcost; return best_cost; } static unsigned search_mv_full(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, const int32_t search_range, uint32_t *bitcost_out) { vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; vector2d_t best_mv = { 0, 0 }; unsigned best_cost = UINT32_MAX; uint32_t best_bitcost = 0, bitcost; int wpp_limit = get_wpp_limit(state, orig); kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { vector2d_t min_mv = { 0 - search_range, 0 - search_range }; vector2d_t max_mv = { 0 + search_range, 0 + search_range }; for (int y = min_mv.y; y <= max_mv.y; ++y) { for (int x = min_mv.x; x <= max_mv.x; ++x) { if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, orig->x + x, orig->y + y, width, height, -1); cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_bitcost = bitcost; best_mv.x = x; best_mv.y = y; } } } } // Check mv_in if it's not one of the merge candidates. if (!mv_in_merge(merge_cand, num_cand, &mv) && intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) { vector2d_t min_mv = { mv.x - search_range, mv.y - search_range }; vector2d_t max_mv = { mv.x + search_range, mv.y + search_range }; for (int y = min_mv.y; y <= max_mv.y; ++y) { for (int x = min_mv.x; x <= max_mv.x; ++x) { if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) { continue; } unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, orig->x + x, orig->y + y, width, height, -1); cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_bitcost = bitcost; best_mv.x = x; best_mv.y = y; } } } } // Select starting point from among merge candidates. These should include // both mv_cand vectors and (0, 0). for (int i = 0; i < num_cand; ++i) { if (merge_cand[i].dir == 3) continue; mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; // Ignore 0-vector because it has already been checked. if (mv.x == 0 && mv.y == 0) continue; vector2d_t min_mv = { mv.x - search_range, mv.y - search_range }; vector2d_t max_mv = { mv.x + search_range, mv.y + search_range }; for (int y = min_mv.y; y <= max_mv.y; ++y) { for (int x = min_mv.x; x <= max_mv.x; ++x) { if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) { continue; } // Avoid calculating the same points over and over again. bool already_tested = false; for (int j = -1; j < i; ++j) { int xx = 0; int yy = 0; if (j >= 0) { if (merge_cand[j].dir == 3) continue; xx = merge_cand[j].mv[merge_cand[j].dir - 1][0] >> 2; yy = merge_cand[j].mv[merge_cand[j].dir - 1][1] >> 2; } if (x >= xx - search_range && x <= xx + search_range && y >= yy - search_range && y <= yy + search_range) { already_tested = true; x = xx + search_range; break; } } if (already_tested) continue; unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, orig->x + x, orig->y + y, width, height, -1); cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; best_bitcost = bitcost; best_mv.x = x; best_mv.y = y; } } } } mv_in_out->x = best_mv.x << 2; mv_in_out->y = best_mv.y << 2; *bitcost_out = best_bitcost; return best_cost; } /** * \brief Do fractional motion estimation * * \param width width of the block * \param height height of the block * \param pic Picture motion vector is searched for. * \param ref Picture motion vector is searched from. * \param orig Top left corner of the searched for block. * \param mv_in_out Predicted mv in and best out. Quarter pixel precision. * * \returns Cost of the motion vector. * * Algoritm first searches 1/2-pel positions around integer mv and after best match is found, * refines the search by searching best 1/4-pel postion around best 1/2-pel position. */ static unsigned search_frac(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) { // Map indexes to relative coordinates in the following way: // 5 3 6 // 1 0 2 // 7 4 8 static const vector2d_t square[9] = { { 0, 0 }, { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; int wpp_limit = get_wpp_limit(state, orig); //Set mv to halfpel precision vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; unsigned best_cost = UINT32_MAX; uint32_t best_bitcost = 0; uint32_t bitcosts[4] = { 0 }; unsigned i; unsigned best_index = 0; unsigned costs[4] = { 0 }; kvz_extended_block src = { 0, 0, 0, 0 }; // Buffers for interpolated fractional pixels one // for each position excluding the integer position. // Has one extra column on left and row on top because // samples are used also from those integer pixels when // searching positions to the left and up. frac_search_block fracpel_blocks[15]; kvz_pixel *hpel_pos[8]; // Horizontal hpel positions hpel_pos[0] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1); hpel_pos[1] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1) + 1; // Vertical hpel positions hpel_pos[2] = fracpel_blocks[HPEL_POS_VER] + 1; hpel_pos[3] = fracpel_blocks[HPEL_POS_VER] + (LCU_WIDTH + 1) + 1; // Diagonal hpel positions hpel_pos[4] = fracpel_blocks[HPEL_POS_DIA]; hpel_pos[5] = fracpel_blocks[HPEL_POS_DIA] + 1; hpel_pos[6] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1); hpel_pos[7] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1) + 1; int fme_level = state->encoder_control->fme_level; kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } kvz_get_extended_block(orig->x, orig->y, mv.x-1, mv.y-1, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH, ref->y, ref->width, ref->height, FILTER_SIZE, width+1, height+1, &src); kvz_filter_frac_blocks_luma(state->encoder_control, src.orig_topleft, src.stride, width, height, fracpel_blocks, fme_level); kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH]; kvz_pixels_blit(pic->y + orig->y * pic->stride + orig->x, tmp_pic, width, height, pic->stride, width); // Search integer position costs[0] = kvz_satd_any_size(width, height, tmp_pic, width, src.orig_topleft + src.stride + 1, src.stride); costs[0] += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]); best_cost = costs[0]; best_bitcost = bitcosts[0]; int last_hpel_index = (fme_level == 1) ? 4 : 8; //Set mv to half-pixel precision mv.x <<= 1; mv.y <<= 1; // Search halfpel positions around best integer mv for (i = 1; i <= last_hpel_index; i+=4) { const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; int8_t within_tile[4] = { fracmv_within_tile(state, orig, (mv.x + pattern[0]->x) << 1, (mv.y + pattern[0]->y) << 1, width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[1]->x) << 1, (mv.y + pattern[1]->y) << 1, width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[2]->x) << 1, (mv.y + pattern[2]->y) << 1, width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[3]->x) << 1, (mv.y + pattern[3]->y) << 1, width, height, wpp_limit), }; int hpel_strides[4] = { (LCU_WIDTH + 1), (LCU_WIDTH + 1), (LCU_WIDTH + 1), (LCU_WIDTH + 1) }; kvz_satd_any_size_quad(width, height, (const kvz_pixel**)(hpel_pos + i - 1), hpel_strides, tmp_pic, width, 4, costs, within_tile); costs[0] += calc_mvd(state, mv.x + pattern[0]->x, mv.y + pattern[0]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]); costs[1] += calc_mvd(state, mv.x + pattern[1]->x, mv.y + pattern[1]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[1]); costs[2] += calc_mvd(state, mv.x + pattern[2]->x, mv.y + pattern[2]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[2]); costs[3] += calc_mvd(state, mv.x + pattern[3]->x, mv.y + pattern[3]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[3]); for (int j = 0; j < 4; ++j) { if (within_tile[j] && costs[j] < best_cost) { best_cost = costs[j]; best_index = i + j; best_bitcost = bitcosts[j]; } } } unsigned int best_hpel_index = best_index; // Move search to best_index mv.x += square[best_index].x; mv.y += square[best_index].y; //Set mv to quarterpel precision mv.x <<= 1; mv.y <<= 1; if (fme_level >= 3) { best_index = 0; int last_qpel_index = (fme_level == 3) ? 4 : 8; //Search quarterpel points around best halfpel mv for (i = 1; i <= last_qpel_index; i += 4) { const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; int8_t within_tile[4] = { fracmv_within_tile(state, orig, (mv.x + pattern[0]->x), (mv.y + pattern[0]->y), width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[1]->x), (mv.y + pattern[1]->y), width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[2]->x), (mv.y + pattern[2]->y), width, height, wpp_limit), fracmv_within_tile(state, orig, (mv.x + pattern[3]->x), (mv.y + pattern[3]->y), width, height, wpp_limit), }; int qpel_indices[4] = { 0 }; int int_offset_x[4] = { 0 }; int int_offset_y[4] = { 0 }; for (int j = 0; j < 4; ++j) { int hpel_offset_x = square[best_hpel_index].x; int hpel_offset_y = square[best_hpel_index].y; int qpel_offset_x = 2 * hpel_offset_x + pattern[j]->x; int qpel_offset_y = 2 * hpel_offset_y + pattern[j]->y; unsigned qpel_filter_x = (qpel_offset_x + 4) % 4; unsigned qpel_filter_y = (qpel_offset_y + 4) % 4; // The first value (-1) is for the integer position and // it will not be used int filters_to_block_idx[4][4] = { { -1, 3, 0, 4 }, { 7, 11, 8, 12 }, { 1, 5, 2, 6 }, { 9, 13, 10, 14 } }; qpel_indices[j] = filters_to_block_idx[qpel_filter_y][qpel_filter_x]; // Select values filtered from correct integer samples int_offset_x[j] = qpel_offset_x >= 0; int_offset_y[j] = qpel_offset_y >= 0; } kvz_pixel *qpel_pos[4] = { fracpel_blocks[qpel_indices[0]] + int_offset_y[0] * (LCU_WIDTH + 1) + int_offset_x[0], fracpel_blocks[qpel_indices[1]] + int_offset_y[1] * (LCU_WIDTH + 1) + int_offset_x[1], fracpel_blocks[qpel_indices[2]] + int_offset_y[2] * (LCU_WIDTH + 1) + int_offset_x[2], fracpel_blocks[qpel_indices[3]] + int_offset_y[3] * (LCU_WIDTH + 1) + int_offset_x[3] }; int qpel_strides[4] = { (LCU_WIDTH + 1), (LCU_WIDTH + 1), (LCU_WIDTH + 1), (LCU_WIDTH + 1) }; kvz_satd_any_size_quad(width, height, (const kvz_pixel**)qpel_pos, qpel_strides, tmp_pic, width, 4, costs, within_tile); costs[0] += calc_mvd(state, mv.x + pattern[0]->x, mv.y + pattern[0]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]); costs[1] += calc_mvd(state, mv.x + pattern[1]->x, mv.y + pattern[1]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[1]); costs[2] += calc_mvd(state, mv.x + pattern[2]->x, mv.y + pattern[2]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[2]); costs[3] += calc_mvd(state, mv.x + pattern[3]->x, mv.y + pattern[3]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[3]); for (int j = 0; j < 4; ++j) { if (within_tile[j] && costs[j] < best_cost) { best_cost = costs[j]; best_index = i + j; best_bitcost = bitcosts[j]; } } } //Set mv to best final best match mv.x += square[best_index].x; mv.y += square[best_index].y; } mv_in_out->x = mv.x; mv_in_out->y = mv.y; *bitcost_out = best_bitcost; if (src.malloc_used) free(src.buffer); return best_cost; } /** * \brief Perform inter search for a single reference frame. */ static void search_pu_inter_ref(encoder_state_t * const state, int x, int y, int width, int height, int depth, lcu_t *lcu, cu_info_t *cur_cu, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, unsigned ref_idx, uint32_t(*get_mvd_cost)(encoder_state_t * const, vector2d_t *, const cabac_data_t*), double *inter_cost, uint32_t *inter_bitcost) { const videoframe_t * const frame = state->tile->frame; kvz_picture *ref_image = state->frame->ref->images[ref_idx]; const vector2d_t orig = { x, y }; uint32_t temp_bitcost = 0; uint32_t temp_cost = 0; int32_t merged = 0; uint8_t cu_mv_cand = 0; int8_t merge_idx = 0; int8_t ref_list = state->frame->refmap[ref_idx].list-1; int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_idx; kvz_inter_get_mv_cand(state, x, y, width, height, mv_cand, cur_cu, lcu, ref_list); cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; vector2d_t mv = { 0, 0 }; { // Take starting point for MV search from previous frame. // When temporal motion vector candidates are added, there is probably // no point to this anymore, but for now it helps. const vector2d_t tile_top_left_corner = { (state->tile->lcu_offset_x << LOG2_LCU_WIDTH), (state->tile->lcu_offset_y << LOG2_LCU_WIDTH) }; const int mid_x = tile_top_left_corner.x + x + (width >> 1); const int mid_y = tile_top_left_corner.y + y + (height >> 1); const cu_array_t* ref_array = state->frame->ref->cu_arrays[ref_idx]; const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); if (ref_cu->type == CU_INTER) { if (ref_cu->inter.mv_dir & 1) { mv.x = ref_cu->inter.mv[0][0]; mv.y = ref_cu->inter.mv[0][1]; } else { mv.x = ref_cu->inter.mv[1][0]; mv.y = ref_cu->inter.mv[1][1]; } } } int search_range = 32; switch (state->encoder_control->cfg->ime_algorithm) { case KVZ_IME_FULL64: search_range = 64; break; case KVZ_IME_FULL32: search_range = 32; break; case KVZ_IME_FULL16: search_range = 16; break; case KVZ_IME_FULL8: search_range = 8; break; default: break; } switch (state->encoder_control->cfg->ime_algorithm) { case KVZ_IME_TZ: temp_cost += tz_search(state, width, height, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); break; case KVZ_IME_FULL64: case KVZ_IME_FULL32: case KVZ_IME_FULL16: case KVZ_IME_FULL8: case KVZ_IME_FULL: temp_cost += search_mv_full(state, width, height, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, search_range, &temp_bitcost); break; default: temp_cost += hexagon_search(state, width, height, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); break; } if (state->encoder_control->cfg->fme_level > 0 && temp_cost < *inter_cost) { temp_cost = search_frac(state, width, height, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); } merged = 0; // Check every candidate to find a match for(merge_idx = 0; merge_idx < num_cand; merge_idx++) { if (merge_cand[merge_idx].dir != 3 && merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x && merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y && (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) { merged = 1; break; } } // Only check when candidates are different if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) { vector2d_t mvd_temp1, mvd_temp2; int cand1_cost,cand2_cost; mvd_temp1.x = mv.x - mv_cand[0][0]; mvd_temp1.y = mv.y - mv_cand[0][1]; cand1_cost = get_mvd_cost(state, &mvd_temp1, &state->cabac); mvd_temp2.x = mv.x - mv_cand[1][0]; mvd_temp2.y = mv.y - mv_cand[1][1]; cand2_cost = get_mvd_cost(state, &mvd_temp2, &state->cabac); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cu_mv_cand = 1; } } if (temp_cost < *inter_cost) { // Map reference index to L0/L1 pictures cur_cu->inter.mv_dir = ref_list+1; uint8_t mv_ref_coded = state->frame->refmap[ref_idx].idx; cur_cu->merged = merged; cur_cu->merge_idx = merge_idx; cur_cu->inter.mv_ref[ref_list] = ref_idx; cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); *inter_cost = temp_cost; *inter_bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; } } /** * \brief Update PU to have best modes at this depth. * * \param state encoder state * \param x_cu x-coordinate of the containing CU * \param y_cu y-coordinate of the containing CU * \param depth depth of the CU in the quadtree * \param part_mode partition mode of the CU * \param i_pu index of the PU in the CU * \param lcu containing LCU * * \param inter_cost Return inter cost of the best mode * \param inter_bitcost Return inter bitcost of the best mode */ static void search_pu_inter(encoder_state_t * const state, int x_cu, int y_cu, int depth, part_mode_t part_mode, int i_pu, lcu_t *lcu, double *inter_cost, uint32_t *inter_bitcost) { *inter_cost = MAX_INT; *inter_bitcost = MAX_INT; const videoframe_t * const frame = state->tile->frame; const int width_cu = LCU_WIDTH >> depth; const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); const int width = PU_GET_W(part_mode, width_cu, i_pu); const int height = PU_GET_H(part_mode, width_cu, i_pu); // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and // nRx2N partitions. const bool merge_a1 = i_pu == 0 || width >= height; // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and // 2NxnD partitions. const bool merge_b1 = i_pu == 0 || width <= height; const int x_local = SUB_SCU(x); const int y_local = SUB_SCU(y); cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); int16_t mv_cand[2][2]; // Search for merge mode candidate inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS]; // Get list of candidates int16_t num_cand = kvz_inter_get_merge_cand(state, x, y, width, height, merge_a1, merge_b1, merge_cand, lcu); uint32_t(*get_mvd_cost)(encoder_state_t * const state, vector2d_t *, const cabac_data_t*) = get_mvd_coding_cost; if (state->encoder_control->cfg->mv_rdo) { get_mvd_cost = kvz_get_mvd_coding_cost_cabac; } // Default to candidate 0 CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); uint32_t ref_idx; for (ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { search_pu_inter_ref(state, x, y, width, height, depth, lcu, cur_cu, mv_cand, merge_cand, num_cand, ref_idx, get_mvd_cost, inter_cost, inter_bitcost); } // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && state->encoder_control->cfg->bipred && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { lcu_t *templcu = MALLOC(lcu_t, 1); unsigned cu_width = LCU_WIDTH >> depth; static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; const unsigned num_cand_pairs = MIN(num_cand * (num_cand - 1), 12); kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } for (int32_t idx = 0; idx < num_cand_pairs; idx++) { uint8_t i = priorityList0[idx]; uint8_t j = priorityList1[idx]; if (i >= num_cand || j >= num_cand) break; // Find one L0 and L1 candidate according to the priority list if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) { if (merge_cand[i].ref[0] != merge_cand[j].ref[1] || merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] || merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) { uint32_t bitcost[2]; uint32_t cost = 0; int8_t cu_mv_cand = 0; int16_t mv[2][2]; kvz_pixel tmp_block[64 * 64]; kvz_pixel tmp_pic[64 * 64]; // Force L0 and L1 references if (state->frame->refmap[merge_cand[i].ref[0]].list == 2 || state->frame->refmap[merge_cand[j].ref[1]].list == 1) continue; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; { // Don't try merge candidates that don't satisfy mv constraints. vector2d_t orig = { x, y }; if (!fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, height, -1) || !fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, height, -1)) { continue; } } kvz_inter_recon_lcu_bipred(state, state->frame->ref->images[merge_cand[i].ref[0]], state->frame->ref->images[merge_cand[j].ref[1]], x, y, width, height, mv, templcu); for (int ypos = 0; ypos < height; ++ypos) { int dst_y = ypos * width; for (int xpos = 0; xpos < width; ++xpos) { tmp_block[dst_y + xpos] = templcu->rec.y[ SUB_SCU(y + ypos) * LCU_WIDTH + SUB_SCU(x + xpos)]; tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width]; } } cost = kvz_satd_any_size(cu_width, cu_width, tmp_pic, cu_width, tmp_block, cu_width); cost += calc_mvd(state, merge_cand[i].mv[0][0], merge_cand[i].mv[0][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]); cost += calc_mvd(state, merge_cand[i].mv[1][0], merge_cand[i].mv[1][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]); if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; uint8_t mv_ref_coded[2] = { state->frame->refmap[merge_cand[i].ref[0]].idx, state->frame->refmap[merge_cand[j].ref[1]].idx }; cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; cur_cu->merged = 0; // Check every candidate to find a match for(int merge_idx = 0; merge_idx < num_cand; merge_idx++) { if ( merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) { cur_cu->merged = 1; cur_cu->merge_idx = merge_idx; break; } } // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { cu_mv_cand = 0; kvz_inter_get_mv_cand(state, x, y, width, height, mv_cand, cur_cu, lcu, reflist); if ((mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) { vector2d_t mvd_temp1, mvd_temp2; int cand1_cost, cand2_cost; mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0]; mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1]; cand1_cost = get_mvd_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac); mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0]; mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1]; cand2_cost = get_mvd_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cu_mv_cand = 1; } } CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); } *inter_cost = cost; *inter_bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + mv_ref_coded[0] + mv_ref_coded[1]; } } } } FREE_POINTER(templcu); } if (*inter_cost < INT_MAX) { const vector2d_t orig = { x, y }; if (cur_cu->inter.mv_dir == 1) { assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, height, -1)); } } } /** * \brief Update CU to have best modes at this depth. * * Only searches the 2Nx2N partition mode. * * \param state encoder state * \param x x-coordinate of the CU * \param y y-coordinate of the CU * \param depth depth of the CU in the quadtree * \param lcu containing LCU * * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, uint32_t *inter_bitcost) { search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu, inter_cost, inter_bitcost); } /** * \brief Update CU to have best modes at this depth. * * Only searches the given partition mode. * * \param state encoder state * \param x x-coordinate of the CU * \param y y-coordinate of the CU * \param depth depth of the CU in the quadtree * \param part_mode partition mode to search * \param lcu containing LCU * * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ void kvz_search_cu_smp(encoder_state_t * const state, int x, int y, int depth, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, uint32_t *inter_bitcost) { const int num_pu = kvz_part_mode_num_parts[part_mode]; const int width = LCU_WIDTH >> depth; const int y_local = SUB_SCU(y); const int x_local = SUB_SCU(x); *inter_cost = 0; *inter_bitcost = 0; for (int i = 0; i < num_pu; ++i) { const int x_pu = PU_GET_X(part_mode, width, x_local, i); const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); cur_pu->type = CU_INTER; cur_pu->part_size = part_mode; cur_pu->depth = depth; double cost = MAX_INT; uint32_t bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); *inter_cost += cost; *inter_bitcost += bitcost; for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); scu->type = CU_INTER; scu->inter = cur_pu->inter; } } } }