Merge branch 'mv-constraint'

This commit is contained in:
Ari Koivula 2016-02-29 23:33:15 +02:00
commit c02b0f4186
7 changed files with 171 additions and 48 deletions

View file

@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
#
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
ver_major=3
ver_minor=2
ver_minor=4
ver_release=0
# not used, but it prevents configure from adding a lot of defines to the CFLAGS

View file

@ -102,6 +102,8 @@ int kvz_config_init(kvz_config *cfg)
cfg->add_encoder_info = true;
cfg->calc_psnr = true;
cfg->mv_constraint = KVZ_MV_CONSTRAIN_NONE;
return 1;
}
@ -279,6 +281,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", NULL };
static const char * const colormatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m",
"smpte240m", "YCgCo", "bt2020nc", "bt2020c", NULL };
static const char * const mv_constraint_names[] = { "none", "frame", "tile", "frametile", "frametilemargin", NULL };
static const char * const preset_values[11][28] = {
{
@ -533,6 +536,13 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
cfg->fme_level = atoi(value);
else if OPT("source-scan-type")
return parse_enum(value, source_scan_type_names, &cfg->source_scan_type);
else if OPT("mv-constraint")
{
int8_t constraint = KVZ_MV_CONSTRAIN_NONE;
int result = parse_enum(value, mv_constraint_names, &constraint);
cfg->mv_constraint = constraint;
return result;
}
else if OPT("sar")
return sscanf(value, "%d:%d", &cfg->vui.sar_width, &cfg->vui.sar_height) == 2;
else if OPT("overscan")

View file

@ -102,6 +102,7 @@ static const struct option long_options[] = {
{ "version", no_argument, NULL, 0 },
{ "help", no_argument, NULL, 0 },
{ "loop-input", no_argument, NULL, 0 },
{ "mv-constraint", required_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -363,6 +364,10 @@ void print_help(void)
" fast, medium, slow, slower, veryslow, placebo\n"
" --no-psnr : Don't calculate PSNR for frames\n"
" --loop-input : Re-read input file forever\n"
" --mv-constraint : Constrain movement vectors\n"
" \"none\": no constraint\n"
" \"frametile\": constrain within the tile\n"
" \"frametilemargin\": constrain even more\n"
"\n"
" Video Usability Information:\n"
" --sar <width:height> : Specify Sample Aspect Ratio\n"

View file

@ -577,10 +577,19 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
#endif
main_state->children[i].tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 1, job_description);
if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].global->is_idr_frame) {
// Add a dependency on each child in the previous frame.
// TODO: Make it so that only adjacent tiles are depended upon and the search is constrained to those?
for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) {
kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done);
#if 0
// Disabled due to non-determinism.
if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN)
{
// When MVs don't cross tile boundaries, add a dependency only on the same tile.
kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[i].previous_encoder_state->tqj_recon_done);
} else
#endif
{
// Add a dependency on each child in the previous frame.
for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) {
kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done);
}
}
}
kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);

View file

@ -101,6 +101,19 @@ enum kvz_interlacing
KVZ_INTERLACING_BFF = 2, // bottom field first
};
/**
 * \brief Constraints that can be placed on motion vectors during inter search.
 *
 * Stored in kvz_config::mv_constraint.
 * \since 3.3.0
 */
enum kvz_mv_constraint
{
KVZ_MV_CONSTRAIN_NONE = 0, // No constraint on motion vectors.
KVZ_MV_CONSTRAIN_FRAME = 1, // Don't refer outside the frame.
KVZ_MV_CONSTRAIN_TILE = 2, // Don't refer to other tiles.
KVZ_MV_CONSTRAIN_FRAME_AND_TILE = 3, // Don't refer outside the tile.
KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN = 4, // Like FRAME_AND_TILE, but also keep enough margin that fractional pixel interpolation does not refer outside the tile.
};
/**
* \brief GoP picture configuration.
*/
@ -204,6 +217,8 @@ typedef struct kvz_config
int8_t mv_rdo; /*!< \brief MV RDO calculation in search (0: estimation, 1: RDO). */
int8_t calc_psnr; /*!< \since 3.1.0 \brief Print PSNR in CLI. */
enum kvz_mv_constraint mv_constraint; /*!< \since 3.3.0 \brief Constrain movement vectors. */
} kvz_config;
/**

View file

@ -524,7 +524,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
&& WITHIN(depth, ctrl->pu_depth_inter.min, ctrl->pu_depth_inter.max);
if (can_use_inter) {
int mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
double mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]);
if (mode_cost < cost) {
cost = mode_cost;
cur_cu->type = CU_INTER;

View file

@ -28,6 +28,46 @@
#include "rdo.h"
/**
 * \brief Check a quarter-pixel motion vector against the configured motion
 *        vector constraint and the WPP limit.
 *
 * All comparisons are done in quarter-pixel units: the integer-pixel inputs
 * (orig, width, wpp_limit and the dimensions of state->tile->frame) are
 * scaled by 4 before being compared with the motion vector.
 *
 * \param state      Encoder state; encoder_control->cfg->mv_constraint and
 *                   the dimensions of tile->frame are read from it.
 * \param orig       Top-left corner of the current block in integer pixels.
 * \param x          Horizontal motion vector component in quarter pixels.
 * \param y          Vertical motion vector component in quarter pixels.
 * \param width      Block width in integer pixels; also used as the height.
 * \param wpp_limit  Upper bound, in integer pixels, for the vertical motion
 *                   vector offset plus the block width, or -1 for no limit.
 *
 * \return True if the referred block satisfies the WPP limit and, unless the
 *         constraint is KVZ_MV_CONSTRAIN_NONE, lies within the bounds of
 *         state->tile->frame.
 */
static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit)
{
  const int constraint = state->encoder_control->cfg->mv_constraint;

  // Block size in quarter pixels. Multiplication is used instead of a left
  // shift throughout, because shifting a negative value is undefined.
  const int size = width * 4;

  // The WPP limit applies in every constraint mode.
  if (wpp_limit != -1 && y + size > wpp_limit * 4) {
    return false;
  }

  if (constraint == KVZ_MV_CONSTRAIN_NONE) {
    return true;
  }

  int lt_margin = 0;
  int rb_margin = 0;
  if (constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
    // Keep extra distance from the boundaries. The margins are compared
    // against quarter-pixel coordinates: 8 on the left/top side and 16 on
    // the right/bottom side.
    lt_margin = 8;
    rb_margin = 16;
  }

  // TODO: implement KVZ_MV_CONSTRAIN_FRAME and KVZ_MV_CONSTRAIN_TILE. They are
  // currently treated like KVZ_MV_CONSTRAIN_FRAME_AND_TILE.

  const vector2d_t abs_mv = { orig->x * 4 + x, orig->y * 4 + y };
  const int max_x = state->tile->frame->width * 4 - rb_margin;
  const int max_y = state->tile->frame->height * 4 - rb_margin;

  return abs_mv.x >= lt_margin && abs_mv.x + size <= max_x &&
         abs_mv.y >= lt_margin && abs_mv.y + size <= max_y;
}
/**
 * \brief Integer-pixel wrapper around fracmv_within_tile.
 *
 * \param state      Encoder state, passed through unchanged.
 * \param orig       Top-left corner of the current block in integer pixels.
 * \param x          Horizontal motion vector component in integer pixels.
 * \param y          Vertical motion vector component in integer pixels.
 * \param width      Block width in integer pixels.
 * \param wpp_limit  WPP limit in integer pixels, or -1 for no limit.
 *
 * \return Result of fracmv_within_tile for the motion vector converted to
 *         quarter-pixel units.
 */
static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int wpp_limit)
{
  // Motion vector components may be negative, and left-shifting a negative
  // value is undefined behavior in C, so scale by multiplication instead.
  return fracmv_within_tile(state, orig, x * 4, y * 4, width, wpp_limit);
}
static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
{
int32_t num_bins = 0;
@ -125,7 +165,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int
unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
int width, int height, int max_px_below_lcu)
int width, int height, int wpp_limit)
{
int n_points;
int best_index = -1;
@ -243,6 +283,10 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
for (i = 0; i < n_points; i++)
{
vector2d_t *current = &pattern[pattern_type][i];
if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, wpp_limit)) {
continue;
}
unsigned cost;
uint32_t bitcost;
@ -251,7 +295,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -287,7 +331,7 @@ unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_pi
unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
int width, int height, int iSearchRange, int iRaster, int max_px_below_lcu)
int width, int height, int iSearchRange, int iRaster, int wpp_limit)
{
int i;
int k;
@ -307,6 +351,10 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
for (k = -iSearchRange; k <= iSearchRange; k += iRaster)
{
vector2d_t current = { k, i };
if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, wpp_limit)) {
continue;
}
unsigned cost;
uint32_t bitcost;
@ -315,7 +363,7 @@ unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_pic
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -366,7 +414,7 @@ static unsigned tz_search(const encoder_state_t * const state,
int iDist;
int best_dist = 0;
unsigned best_index = num_cand;
int max_px_below_lcu = -1;
int wpp_limit = -1;
int(*calc_mvd)(const encoder_state_t * const, int, int, int,
int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -376,28 +424,28 @@ static unsigned tz_search(const encoder_state_t * const state,
}
if (state->encoder_control->owf) {
max_px_below_lcu = LCU_WIDTH;
wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
if (state->encoder_control->fme_level > 0) {
// Fractional motion estimation can change the mv by at most 1 pixel.
max_px_below_lcu -= 1;
wpp_limit -= 1;
}
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
max_px_below_lcu -= 3;
wpp_limit -= 3;
}
}
//step 1, compare (0,0) vector to predicted vectors
// Check whatever input vector we got, unless its (0, 0) which will be checked later.
if (mv.x || mv.y)
if ((mv.x || mv.y) && intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit))
{
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
width, height, max_px_below_lcu);
width, height, -1);
best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -415,6 +463,9 @@ static unsigned tz_search(const encoder_state_t * const state,
if (merge_cand[i].dir == 3) continue;
mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) {
continue;
}
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
@ -422,7 +473,7 @@ static unsigned tz_search(const encoder_state_t * const state,
unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -450,7 +501,7 @@ static unsigned tz_search(const encoder_state_t * const state,
for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
{
best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
}
//step 3, raster scan
@ -459,7 +510,7 @@ static unsigned tz_search(const encoder_state_t * const state,
best_dist = iRaster;
best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand,
num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, max_px_below_lcu);
num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit);
}
//step 4
@ -471,7 +522,7 @@ static unsigned tz_search(const encoder_state_t * const state,
while (iDist > 0)
{
best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
iDist = iDist >> 1;
}
@ -483,7 +534,7 @@ static unsigned tz_search(const encoder_state_t * const state,
for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
{
best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu);
mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
}
}
@ -552,7 +603,7 @@ static unsigned hexagon_search(const encoder_state_t * const state,
uint32_t best_bitcost = 0, bitcost;
unsigned i;
unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
int max_px_below_lcu = -1;
int wpp_limit = -1;
int (*calc_mvd)(const encoder_state_t * const, int, int, int,
int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
@ -561,16 +612,15 @@ static unsigned hexagon_search(const encoder_state_t * const state,
calc_mvd = kvz_calc_mvd_cost_cabac;
}
if (state->encoder_control->owf) {
max_px_below_lcu = LCU_WIDTH;
wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
if (state->encoder_control->fme_level > 0) {
// Fractional motion estimation can change the mv by at most 1 pixel.
max_px_below_lcu -= 1;
wpp_limit -= 1;
}
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
max_px_below_lcu -= 3;
wpp_limit -= 3;
}
}
@ -585,13 +635,15 @@ static unsigned hexagon_search(const encoder_state_t * const state,
}
}
if (!mv_in_merge_cand) {
if (!mv_in_merge_cand &&
intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit))
{
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
width, height, max_px_below_lcu);
width, height, -1);
best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
best_bitcost = bitcost;
best_index = num_cand;
@ -609,13 +661,16 @@ static unsigned hexagon_search(const encoder_state_t * const state,
if (merge_cand[i].dir == 3) continue;
mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
if (!intmv_within_tile(state, orig, mv.x, mv.y, width, wpp_limit)) {
continue;
}
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -642,13 +697,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
best_index = 0;
for (i = 0; i < 7; ++i) {
const vector2d_t *pattern = &large_hexbs[i];
if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) {
continue;
}
unsigned cost;
{
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
@ -685,13 +744,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
// Iterate through the next 3 points.
for (i = 0; i < 3; ++i) {
const vector2d_t *offset = &large_hexbs[start + i];
if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) {
continue;
}
unsigned cost;
{
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
@ -717,13 +780,17 @@ static unsigned hexagon_search(const encoder_state_t * const state,
// Do the final step of the search with a small pattern.
for (i = 1; i < 5; ++i) {
const vector2d_t *offset = &small_hexbs[i];
if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, wpp_limit)) {
continue;
}
unsigned cost;
{
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX);
cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
width, height, max_px_below_lcu);
width, height, -1);
cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height,
(state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
@ -847,12 +914,21 @@ static unsigned search_frac(const encoder_state_t * const state,
{ -1, -1 }, { 0, -1 }, { 1, -1 }
};
int wpp_limit = -1;
if (state->encoder_control->owf) {
wpp_limit = 2 * LCU_WIDTH - orig->x % LCU_WIDTH;
if (state->encoder_control->deblock_enable) {
// Strong deblock filter modifies 3 pixels.
wpp_limit -= 3;
}
}
//Set mv to halfpel precision
vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
unsigned best_cost = UINT32_MAX;
uint32_t best_bitcost = 0, bitcost;
unsigned i;
unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
unsigned best_index = 4;
unsigned cost = 0;
@ -896,6 +972,9 @@ static unsigned search_frac(const encoder_state_t * const state,
// Search halfpel positions around best integer mv
for (i = 0; i < 9; ++i) {
const vector2d_t *pattern = &square[i];
if (!fracmv_within_tile(state, orig, (mv.x + pattern->x) << 1, (mv.y + pattern->y) << 1, width, wpp_limit)) {
continue;
}
int y,x;
for(y = 0; y < height; ++y) {
@ -911,7 +990,6 @@ static unsigned search_frac(const encoder_state_t * const state,
tmp_filtered, width);
cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
if (cost < best_cost) {
best_cost = cost;
best_index = i;
@ -920,12 +998,12 @@ static unsigned search_frac(const encoder_state_t * const state,
}
}
//Set mv to best match
// Move search to best_index
mv.x += square[best_index].x;
mv.y += square[best_index].y;
halfpel_offset.x = square[best_index].x*2;
halfpel_offset.y = square[best_index].y*2;
best_index = 4;
//Set mv to quarterpel precision
mv.x <<= 1;
@ -934,6 +1012,9 @@ static unsigned search_frac(const encoder_state_t * const state,
//Search quarterpel points around best halfpel mv
for (i = 0; i < 9; ++i) {
const vector2d_t *pattern = &square[i];
if (!fracmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, wpp_limit)) {
continue;
}
int y,x;
for(y = 0; y < height; ++y) {
@ -1081,7 +1162,7 @@ static void search_pu_inter_ref(const encoder_state_t * const state,
ref_idx,
&temp_bitcost);
}
merged = 0;
// Check every candidate to find a match
for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
@ -1206,7 +1287,7 @@ static int search_pu_inter(const encoder_state_t * const state,
cur_cu->inter.mv_cand[0] = 0;
cur_cu->inter.mv_cand[1] = 0;
cur_cu->inter.cost = UINT_MAX;
cur_cu->inter.cost = INT_MAX;
uint32_t ref_idx;
for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
@ -1261,16 +1342,12 @@ static int search_pu_inter(const encoder_state_t * const state,
mv[1][0] = merge_cand[j].mv[1][0];
mv[1][1] = merge_cand[j].mv[1][1];
// Check boundaries when using owf to process multiple frames at the same time
if (max_px_below_lcu >= 0) {
// When SAO is off, row is considered reconstructed when the last LCU
// is done, although the bottom 2 pixels might still need deblocking.
// To work around this, add 2 luma pixels to the reach of the mv
// in order to avoid referencing those possibly non-deblocked pixels.
int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
int cur_lcu_row = y / LCU_WIDTH;
if (mv_lcu_row_reach_1 > cur_lcu_row + max_px_below_lcu || mv_lcu_row_reach_2 > cur_lcu_row + max_px_below_lcu) {
{
// Don't try merge candidates that don't satisfy mv constraints.
vector2d_t orig = { x, y };
if (fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, -1) ||
fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, -1))
{
continue;
}
}
@ -1364,6 +1441,13 @@ static int search_pu_inter(const encoder_state_t * const state,
FREE_POINTER(templcu);
}
if (cur_cu->inter.cost < INT_MAX) {
const vector2d_t orig = { x, y };
if (cur_cu->inter.mv_dir == 1) {
assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, -1));
}
}
return cur_cu->inter.cost;
}