From d6b2ec58147c76c9f8ee0f9e65b17ce336bde25e Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 18:47:14 +0200 Subject: [PATCH 01/85] Only check used reference picture lists when validating merge candidates. Merge candidate struct should be initialized to zero, so this should not have any effect. The conditions are added in case someone decides to copy the code as an example. --- src/search_inter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index f246e48b..216bbb49 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1709,8 +1709,10 @@ static void search_pu_inter(encoder_state_t * const state, // Don't try merge candidates that don't satisfy mv constraints. // Don't add duplicates to list - if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_cu->inter.mv_dir & 1; + bool active_L1 = cur_cu->inter.mv_dir & 2; + if (active_L0 && !fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || is_duplicate) { continue; From f1f0033bf57a2a837177d3fd2a5136be94183039 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 2 Dec 2021 10:42:30 +0200 Subject: [PATCH 02/85] Add a cli option to control whether intra cus are tried to combine on the lower depth when search for said depth is disabled --- src/cfg.c | 5 +++++ src/cli.c | 8 ++++++++ src/kvazaar.h | 3 +++ src/search.c | 3 ++- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/cfg.c b/src/cfg.c index 07c71a55..c8a3dfa4 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -183,6 +183,8 @@ int kvz_config_init(kvz_config *cfg) cfg->fastrd_sampling_on = 0; cfg->fastrd_accuracy_check_on = 0; cfg->fastrd_learning_outdir_fn = NULL; + + cfg->combine_intra_cus = 1; return 1; } @@ -1421,6 +1423,9 
@@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) else if OPT("stats-file-prefix") { cfg->stats_file_prefix = strdup(value); } + else if OPT("combine-intra-cus") { + cfg->combine_intra_cus = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 811537b3..2212aa9b 100644 --- a/src/cli.c +++ b/src/cli.c @@ -167,6 +167,8 @@ static const struct option long_options[] = { { "fastrd-sampling", no_argument, NULL, 0 }, { "fastrd-accuracy-check", no_argument, NULL, 0 }, { "fastrd-outdir", required_argument, NULL, 0 }, + { "combine-intra-cus", no_argument, NULL, 0 }, + { "no-combine-intra-cus", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -578,6 +580,12 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n" + " on lower depth even when search is not\n" + " performed on said depth. Should only\n" + " be disabled if cus absolutely must not\n" + " be larger than limited by the search.\n" + " [enabled]\n" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/kvazaar.h b/src/kvazaar.h index f03ffa27..0e6779b4 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -479,6 +479,9 @@ typedef struct kvz_config char *fastrd_learning_outdir_fn; + /** \brief whether to try combining intra cus at the lower depth when search + * is not performed at said depth*/ + uint8_t combine_intra_cus; } kvz_config; /** diff --git a/src/search.c b/src/search.c index 909e7aa5..d2de84cb 100644 --- a/src/search.c +++ b/src/search.c @@ -754,7 +754,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // gets used, at least in the most obvious cases, while avoiding any // searching.
if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH - && x + cu_width <= frame->width && y + cu_width <= frame->height) + && x + cu_width <= frame->width && y + cu_width <= frame->height + && state->encoder_control->cfg.combine_intra_cus) { cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); From ec2f4e0bac18f9c5b077713168fb91495ab5e17a Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 19:32:45 +0200 Subject: [PATCH 03/85] Use double for RD costs in most places --- src/rdo.c | 20 ++++++++--------- src/search.c | 4 ++-- src/search_inter.c | 53 +++++++++++++++++++++++----------------------- src/search_inter.h | 2 +- src/search_intra.c | 5 ++--- src/transform.c | 8 +++---- 6 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index ec713603..5403fa61 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1029,15 +1029,15 @@ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, * \returns int * Calculates Motion Vector cost and related costs using CABAC coding */ -uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, - int x, - int y, - int mv_shift, - int16_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, + int x, + int y, + int mv_shift, + int16_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + uint32_t *bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; @@ -1174,7 +1174,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); // Store bitcost before restoring cabac - return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5); + return *bitcost * state->lambda_sqrt; } void kvz_close_rdcost_outfiles(void) diff --git a/src/search.c b/src/search.c index 909e7aa5..4345ad75 
100644 --- a/src/search.c +++ b/src/search.c @@ -462,8 +462,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; - double cost = MAX_INT; - double inter_zero_coeff_cost = MAX_INT; + double cost = MAX_DOUBLE; + double inter_zero_coeff_cost = MAX_DOUBLE; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; diff --git a/src/search_inter.c b/src/search_inter.c index 216bbb49..1b705e4f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -85,7 +85,7 @@ typedef struct { /** * \brief Cost of best_mv */ - uint32_t best_cost; + double best_cost; /** * \brief Bit cost of best_mv */ @@ -390,15 +390,15 @@ static int select_mv_cand(const encoder_state_t *state, } -static uint32_t calc_mvd_cost(const encoder_state_t *state, - int x, - int y, - int mv_shift, - int16_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +static double calc_mvd_cost(const encoder_state_t *state, + int x, + int y, + int mv_shift, + int16_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + uint32_t *bitcost) { uint32_t temp_bitcost = 0; uint32_t merge_idx; @@ -428,7 +428,7 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state, temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; - return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); + return temp_bitcost * state->lambda_sqrt; } @@ -624,7 +624,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
@@ -732,7 +732,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). @@ -832,7 +832,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, 0} }; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). @@ -997,11 +997,12 @@ static void search_frac(inter_search_info_t *info) // Set mv to pixel precision vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - unsigned best_cost = UINT32_MAX; + double best_cost = MAX_DOUBLE; uint32_t best_bitcost = 0; uint32_t bitcosts[4] = { 0 }; unsigned best_index = 0; +// Keep this as unsigned until SAD / SATD functions are updated unsigned costs[4] = { 0 }; ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE]; @@ -1338,7 +1339,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; switch (cfg->ime_algorithm) { case KVZ_IME_TZ: @@ -1365,7 +1366,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { search_frac(info); - } else if (info->best_cost < UINT32_MAX) { + } else if (info->best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. 
info->best_cost = kvz_image_calc_satd( info->state->tile->frame->source, @@ -1376,7 +1377,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), info->width, info->height); - info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5); + info->best_cost += info->best_bitcost * info->state->lambda_sqrt; } mv = info->best_mv; @@ -1504,7 +1505,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; - uint32_t cost = + double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); uint32_t bitcost[2] = { 0, 0 }; @@ -1529,7 +1530,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, merge_cand[j].ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info->state->lambda_sqrt * extra_bits + 0.5; + cost += info->state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; @@ -1630,7 +1631,7 @@ static void search_pu_inter(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { - *inter_cost = MAX_INT; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; const kvz_config *cfg = &state->encoder_control->cfg; @@ -1826,7 +1827,7 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - uint32_t cost = + double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); uint32_t bitcost[2] = { 0, 0 }; @@ -1851,7 +1852,7 @@ static void search_pu_inter(encoder_state_t * const state, unipreds[1].inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits 
+ 0.5; + cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; @@ -2056,14 +2057,14 @@ void kvz_search_cu_smp(encoder_state_t * const state, cur_pu->depth = depth; cur_pu->qp = state->qp; - double cost = MAX_INT; + double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); - if (cost >= MAX_INT) { + if (cost == MAX_DOUBLE) { // Could not find any motion vector. - *inter_cost = MAX_INT; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; return; } diff --git a/src/search_inter.h b/src/search_inter.h index 0d7fb81b..8b4b16f2 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -64,7 +64,7 @@ enum hpel_position { HPEL_POS_DIA = 2 }; -typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state, +typedef double kvz_mvd_cost_func(const encoder_state_t *state, int x, int y, int mv_shift, int16_t mv_cand[2][2], diff --git a/src/search_intra.c b/src/search_intra.c index 9cf984db..6d3aa141 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -524,9 +524,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. - int lambda_cost = (int)(state->lambda_sqrt + 0.5); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds); + costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds); } #undef PARALLEL_BLKS @@ -595,7 +594,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) { int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds); - costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5); + costs[rdo_mode] = rdo_bitcost * state->lambda; // Perform transform split search and save mode RD cost for the best one. 
cu_info_t pred_cu; diff --git a/src/transform.c b/src/transform.c index f8e6325f..7a339e27 100644 --- a/src/transform.c +++ b/src/transform.c @@ -250,25 +250,23 @@ int kvz_quantize_residual_trskip( struct { kvz_pixel rec[4*4]; coeff_t coeff[4*4]; - uint32_t cost; + double cost; int has_coeffs; } skip, noskip, *best; - - const int bit_cost = (int)(state->lambda + 0.5); noskip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 0, in_stride, 4, ref_in, pred_in, noskip.rec, noskip.coeff, false); noskip.cost = kvz_pixels_calc_ssd(ref_in, noskip.rec, in_stride, 4, 4); - noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; + noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * state->lambda; skip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 1, in_stride, 4, ref_in, pred_in, skip.rec, skip.coeff, false); skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, 4, 4); - skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * bit_cost; + skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * state->lambda; if (noskip.cost <= skip.cost) { *trskip_out = 0; From e000c7229fb1b5b3f210c1bf7a5c25da40cc24b3 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 18:54:08 +0200 Subject: [PATCH 04/85] Fix bit costs in search_pu_inter_ref a bit --- src/search_inter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 1b705e4f..3eb0f840 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1398,16 +1398,17 @@ static void search_pu_inter_ref(inter_search_info_t *info, } // Only check when candidates are different + uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = 0; if (!merged) { cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; } if (info->best_cost < *inter_cost) { // 
Map reference index to L0/L1 pictures cur_cu->inter.mv_dir = ref_list+1; - uint8_t mv_ref_coded = LX_idx; cur_cu->merged = merged; cur_cu->merge_idx = merge_idx; @@ -1418,7 +1419,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; + *inter_bitcost = info->best_bitcost; } From 3265d45a4e5d8b69c5fa4a6617810b881130d8e9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 02:02:52 +0200 Subject: [PATCH 05/85] Temporarily remove FME threshold for verification purposes --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 3eb0f840..b2f4a765 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1363,7 +1363,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, break; } - if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { + if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { search_frac(info); } else if (info->best_cost < MAX_DOUBLE) { From 936fb766852e669e88946802f64d58d46ee27fa9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 23:47:10 +0200 Subject: [PATCH 06/85] Remove merge candidate stuff from search_pu_inter_ref There is a separate merge analysis now --- src/search_inter.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index b2f4a765..f091c260 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1382,36 +1382,17 @@ static void search_pu_inter_ref(inter_search_info_t *info, mv = info->best_mv; - int merged = 0; - int merge_idx = 0; - // Check every candidate to find a match - for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (info->merge_cand[merge_idx].dir != 3 && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == 
mv.x && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y && - (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][ - info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx) - { - merged = 1; - break; - } - } - // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = 0; - if (!merged) { - cu_mv_cand = - select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; - } + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; if (info->best_cost < *inter_cost) { // Map reference index to L0/L1 pictures cur_cu->inter.mv_dir = ref_list+1; - cur_cu->merged = merged; - cur_cu->merge_idx = merge_idx; + cur_cu->merged = false; + cur_cu->skipped = false; cur_cu->inter.mv_ref[ref_list] = LX_idx; cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; @@ -1428,6 +1409,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); if (valid_mv) { // Map reference index to L0/L1 pictures + unipred_LX[ref_list].merged = false; + unipred_LX[ref_list].skipped = false; unipred_LX[ref_list].inter.mv_dir = ref_list + 1; unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; From 90c0a708a799ae01896d0c0943d0a8a104cd98b7 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 28 Nov 2021 23:40:16 +0200 Subject: [PATCH 07/85] Add new structs for storing statistics during the search. Use in AMVP search. 
--- src/search.c | 20 ++++++++++ src/search.h | 15 ++++++++ src/search_inter.c | 91 +++++++++++++++++++++++++++++++--------------- 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/src/search.c b/src/search.c index 4345ad75..385c4981 100644 --- a/src/search.c +++ b/src/search.c @@ -415,6 +415,7 @@ static double calc_mode_bits(const encoder_state_t *state, } +// TODO: replace usages of this by the kvz_sort_indices_by_cost function. /** * \brief Sort modes and costs to ascending order according to costs. */ @@ -439,6 +440,25 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t } +/** + * \brief Sort indices to ascending order according to costs. + */ +void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map) +{ + // Size of sorted arrays is expected to be "small". No need for faster algorithm. + for (uint8_t i = 1; i < map->size; ++i) { + const int8_t cur_idx = map->idx[i]; + const double cur_cost = map->stats[cur_idx].cost; + uint8_t j = i; + while (j > 0 && cur_cost < map->stats[map->idx[j - 1]].cost) { + map->idx[j] = map->idx[j - 1]; + --j; + } + map->idx[j] = cur_idx; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; diff --git a/src/search.h b/src/search.h index 774a4d7b..fe6d7f5d 100644 --- a/src/search.h +++ b/src/search.h @@ -44,7 +44,22 @@ #include "image.h" #include "constraint.h" +typedef struct blk_stats_t { + + cu_info_t blk; // block info of this entry + double cost; // RD cost of this entry + uint32_t bits; // bit cost of this entry +} blk_stats_t; + +typedef struct blk_stats_map_t { + + blk_stats_t *stats; // list of block statistics entries + int8_t *idx; // list of indices to block stats (to be sorted by costs) + int size; // number of active elements in the lists +} blk_stats_map_t; + void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); +void kvz_sort_indices_by_cost(blk_stats_map_t 
*__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index f091c260..d561387a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1215,11 +1215,12 @@ static void apply_mv_scaling(int32_t current_poc, */ static void search_pu_inter_ref(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, + lcu_t *lcu, + cu_info_t *cur_cu, double *inter_cost, uint32_t *inter_bitcost, double *best_LX_cost, - cu_info_t *unipred_LX) + blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1409,15 +1410,23 @@ static void search_pu_inter_ref(inter_search_info_t *info, bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); if (valid_mv) { // Map reference index to L0/L1 pictures - unipred_LX[ref_list].merged = false; - unipred_LX[ref_list].skipped = false; - unipred_LX[ref_list].inter.mv_dir = ref_list + 1; - unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; - unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; - unipred_LX[ref_list].inter.mv[ref_list][1] = (int16_t)mv.y; + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped = false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; - CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; + + // TODO: remove (this is just to keep old functionality) best_LX_cost[ref_list] = info->best_cost; } } @@ -1669,6 +1678,7 @@ static void search_pu_inter(encoder_state_t * const state, mrg_costs[i] = MAX_DOUBLE; } + cu_info_t orig_cu = *cur_cu; int num_rdo_cands = 0; // Check motion vector constraints 
and perform rough search @@ -1765,16 +1775,31 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - cu_info_t unipreds[2]; + double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; // TODO: remove + blk_stats_t stats[2][MAX_REF_PIC_COUNT]; + int8_t idx[2][MAX_REF_PIC_COUNT]; + blk_stats_map_t amvp[2]; + + for (int ref_list = 0; ref_list < 2; ++ref_list) { + amvp[ref_list].stats = stats[ref_list]; + amvp[ref_list].idx = idx [ref_list]; + amvp[ref_list].size = 0; + for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { + amvp[ref_list].stats[i].blk = orig_cu; + amvp[ref_list].idx[i] = i; + } + } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds); + search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, amvp); } + kvz_sort_indices_by_cost(&amvp[0]); + kvz_sort_indices_by_cost(&amvp[1]); + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1792,15 +1817,21 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; + int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + cu_info_t *best_unipred[2] = { + &amvp[0].stats[best_idx[0]].blk, + &amvp[1].stats[best_idx[1]].blk + }; + int16_t mv[2][2]; - mv[0][0] = unipreds[0].inter.mv[0][0]; - mv[0][1] = unipreds[0].inter.mv[0][1]; - mv[1][0] = unipreds[1].inter.mv[1][0]; - mv[1][1] = unipreds[1].inter.mv[1][1]; + mv[0][0] = best_unipred[0]->inter.mv[0][0]; + mv[0][1] = best_unipred[0]->inter.mv[0][1]; + mv[1][0] = best_unipred[1]->inter.mv[1][0]; + mv[1][1] = best_unipred[1]->inter.mv[1][1]; 
kvz_inter_recon_bipred(info.state, - ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]], - ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]], + ref->images[ref_LX[0][best_unipred[0]->inter.mv_ref[0]]], + ref->images[ref_LX[1][best_unipred[1]->inter.mv_ref[1]]], x, y, width, height, @@ -1817,23 +1848,23 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; cost += info.mvd_cost_func(info.state, - unipreds[0].inter.mv[0][0], - unipreds[0].inter.mv[0][1], + best_unipred[0]->inter.mv[0][0], + best_unipred[0]->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); cost += info.mvd_cost_func(info.state, - unipreds[1].inter.mv[1][0], - unipreds[1].inter.mv[1][1], + best_unipred[1]->inter.mv[1][0], + best_unipred[1]->inter.mv[1][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - unipreds[0].inter.mv_ref[0], - unipreds[1].inter.mv_ref[1] + best_unipred[0]->inter.mv_ref[0], + best_unipred[1]->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info.state->lambda_sqrt * extra_bits; @@ -1841,13 +1872,13 @@ static void search_pu_inter(encoder_state_t * const state, if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; - cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1]; + cur_cu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + cur_cu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0]; - cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1]; - cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0]; - cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1]; + cur_cu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; + cur_cu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; + cur_cu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; + cur_cu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; cur_cu->merged = 0; // Check every candidate 
to find a match From 2ed434e57bcd7bbe351d1d335db101c32420850a Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 02:16:28 +0200 Subject: [PATCH 08/85] Remove now deprecated array --- src/search_inter.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index d561387a..514ca1a9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1219,7 +1219,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, cu_info_t *cur_cu, double *inter_cost, uint32_t *inter_bitcost, - double *best_LX_cost, blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1406,29 +1405,24 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Update best unipreds for biprediction - if (info->best_cost < best_LX_cost[ref_list]) { - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + // Map reference index to L0/L1 pictures + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped = false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - entry->cost = info->best_cost; - entry->bits = info->best_bitcost; - cur_map->size++; - - // TODO: remove (this 
is just to keep old functionality) - best_LX_cost[ref_list] = info->best_cost; - } + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; } } @@ -1775,7 +1769,6 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; // TODO: remove blk_stats_t stats[2][MAX_REF_PIC_COUNT]; int8_t idx[2][MAX_REF_PIC_COUNT]; blk_stats_map_t amvp[2]; @@ -1794,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, amvp); } kvz_sort_indices_by_cost(&amvp[0]); @@ -1808,7 +1801,7 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { // Try biprediction from valid acquired unipreds. - if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) { + if (amvp[0].size > 0 && amvp[1].size > 0) { // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. 
From 1940f0880f1440ac15440adbb63c560b1baab4a7 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 16:57:40 +0200 Subject: [PATCH 09/85] Add amvp unipredictions to both lists if reference picture is present --- src/search_inter.c | 350 ++++++++++++++++++++++----------------------- 1 file changed, 174 insertions(+), 176 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 514ca1a9..4257ff09 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1223,206 +1223,204 @@ static void search_pu_inter_ref(inter_search_info_t *info, { const kvz_config *cfg = &info->state->encoder_control->cfg; - // which list, L0 or L1, ref_idx is in and in what index - int8_t ref_list = -1; - // the index of the ref_idx in L0 or L1 list - int8_t LX_idx; - // max value of LX_idx plus one - const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + // Reference picture might be in both lists + bool ref_list_active[2] = { false, false }; + // Reference picture indices in L0 and L1 lists + int8_t ref_list_idx[2] = { -1, -1 }; - for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) - { - // check if ref_idx is in L0 - if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { - ref_list = 0; - break; - } - - // check if ref_idx is in L1 - if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { - ref_list = 1; - break; - } - } - // ref_idx has to be found in either L0 or L1 - assert(LX_idx < LX_IDX_MAX_PLUS_1); - - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - - // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = LX_idx; - - kvz_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); - - // store old values back - cur_cu->inter.mv_ref[ref_list] = 
temp_ref_idx; - - vector2d_t mv = { 0, 0 }; - - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - vector2d_t mv_previous = { 0, 0 }; - if (ref_cu->inter.mv_dir & 1) { - mv_previous.x = ref_cu->inter.mv[0][0]; - mv_previous.y = ref_cu->inter.mv[0][1]; - } - else { - mv_previous.x = ref_cu->inter.mv[1][0]; - mv_previous.y = ref_cu->inter.mv[1][1]; - } - // Apply mv scaling if neighbor poc is available - if (info->state->frame->ref_LX_size[ref_list] > 0) { - // When there are reference pictures from the future (POC > current POC) - // in L0 or L1, the primary list for the colocated PU is the inverse of - // collocated_from_l0_flag. Otherwise it is equal to reflist. - // - // Kvazaar always sets collocated_from_l0_flag so the list is L1 when - // there are future references. - int col_list = ref_list; - for (int i = 0; i < info->state->frame->ref->used_size; i++) { - if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { - col_list = 1; - break; - } + // Check if ref picture is present in the lists + for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) { + if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) { + ref_list_active[ref_list] = true; + ref_list_idx[ref_list] = i; + break; } - if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { - // Use the other list if the colocated PU does not have a MV for the - // primary list. 
- col_list = 1 - col_list; - } - - uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; - // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument - apply_mv_scaling( - info->state->frame->poc, - info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], - info->state->frame->ref->pocs[neighbor_poc_index], - info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ - info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] - [ref_cu->inter.mv_ref[col_list]] - ], - &mv_previous - ); - } - - // Check if the mv is valid after scaling - if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; } } - int search_range = 32; - switch (cfg->ime_algorithm) { - case KVZ_IME_FULL64: search_range = 64; break; - case KVZ_IME_FULL32: search_range = 32; break; - case KVZ_IME_FULL16: search_range = 16; break; - case KVZ_IME_FULL8: search_range = 8; break; - default: break; - } + // Must find at least one reference picture + assert(ref_list_active[0] || ref_list_active[1]); - info->best_cost = MAX_DOUBLE; + // TODO: remove + double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, mv); - break; + for (int ref_list = 1; ref_list >= 0; --ref_list) { + if (ref_list_active[ref_list]) { - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, mv); - break; + int LX_idx = ref_list_idx[ref_list]; - case KVZ_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; + // store temp values to be stored back later + int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; - } + // Get MV candidates + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; - if (cfg->fme_level > 0 && 
info->best_cost < MAX_DOUBLE) { - search_frac(info); - - } else if (info->best_cost < MAX_DOUBLE) { - // Recalculate inter cost with SATD. - info->best_cost = kvz_image_calc_satd( - info->state->tile->frame->source, - info->ref, + kvz_inter_get_mv_cand(info->state, info->origin.x, info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), info->width, - info->height); - info->best_cost += info->best_bitcost * info->state->lambda_sqrt; - } + info->height, + info->mv_cand, + cur_cu, + lcu, + ref_list); - mv = info->best_mv; + // store old values back + cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - // Only check when candidates are different - uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + vector2d_t mv = { 0, 0 }; - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list+1; + // Take starting point for MV search from previous frame. + // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. 
+ const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + vector2d_t mv_previous = { 0, 0 }; + if (ref_cu->inter.mv_dir & 1) { + mv_previous.x = ref_cu->inter.mv[0][0]; + mv_previous.y = ref_cu->inter.mv[0][1]; + } else { + mv_previous.x = ref_cu->inter.mv[1][0]; + mv_previous.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; + } + } + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. 
+ col_list = 1 - col_list; + } - cur_cu->merged = false; - cur_cu->skipped = false; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv_previous + ); + } - CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); + // Check if the mv is valid after scaling + if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { + mv = mv_previous; + } + } - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost; - } + int search_range = 32; + switch (cfg->ime_algorithm) { + case KVZ_IME_FULL64: search_range = 64; break; + case KVZ_IME_FULL32: search_range = 32; break; + case KVZ_IME_FULL16: search_range = 16; break; + case KVZ_IME_FULL8: search_range = 8; break; + default: break; + } + info->best_cost = MAX_DOUBLE; - // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, mv); + break; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + case KVZ_IME_FULL64: + case 
KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, mv); + break; - entry->cost = info->best_cost; - entry->bits = info->best_bitcost; - cur_map->size++; + case KVZ_IME_DIA: + diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + + default: + hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + } + + if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { + search_frac(info); + + } else if (info->best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. + info->best_cost = kvz_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), + info->width, + info->height); + info->best_cost += info->best_bitcost * info->state->lambda_sqrt; + } + + mv = info->best_mv; + + // Only check when candidates are different + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + if (info->best_cost < *inter_cost) { + // Map reference index to L0/L1 pictures + cur_cu->inter.mv_dir = ref_list + 1; + + cur_cu->merged = false; + cur_cu->skipped = false; + cur_cu->inter.mv_ref[ref_list] = LX_idx; + cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; + cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; + CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); + + *inter_cost = info->best_cost; + *inter_bitcost = info->best_bitcost; + } + + // Map reference index to L0/L1 pictures + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped 
= false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; + } + } } } From 48773b0d25e61f182ad843ef33b7a38f3c0197e0 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 00:19:25 +0200 Subject: [PATCH 10/85] Replace and relocate deprecated cost and mode parameter tracking. --- src/search_inter.c | 52 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 4257ff09..7cbf882d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1217,8 +1217,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, int depth, lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost, blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1242,10 +1240,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Must find at least one reference picture assert(ref_list_active[0] || ref_list_active[1]); - // TODO: remove - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - - for (int ref_list = 1; ref_list >= 0; --ref_list) { + for (int ref_list = 0; ref_list < 2; ++ref_list) { if (ref_list_active[ref_list]) { int LX_idx = ref_list_idx[ref_list]; @@ -1388,21 +1383,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list + 1; - - cur_cu->merged = false; - cur_cu->skipped = false; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; - 
CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); - - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost; - } + if (valid_mv && info->best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures blk_stats_map_t *cur_map = &amvp[ref_list]; @@ -1785,12 +1766,33 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_cu, amvp); } kvz_sort_indices_by_cost(&amvp[0]); kvz_sort_indices_by_cost(&amvp[1]); + int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + double best_cost_L0 = MAX_DOUBLE; + double best_cost_L1 = MAX_DOUBLE; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].stats[best_idx[0]].cost; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].stats[best_idx[1]].cost; + int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; + int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; + + cu_info_t *best_unipred[2] = { + &amvp[0].stats[best_idx[0]].blk, + &amvp[1].stats[best_idx[1]].blk + }; + + // Set best valid unipred to cur_cu + if (best_cost < MAX_DOUBLE) { + // Map reference index to L0/L1 pictures + *cur_cu = *best_unipred[best_list]; + *inter_cost = amvp[best_list].stats[best_idx[best_list]].cost; + *inter_bitcost = amvp[best_list].stats[best_idx[best_list]].bits; + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1808,12 +1810,6 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; - int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; - cu_info_t *best_unipred[2] = { - &amvp[0].stats[best_idx[0]].blk, - &amvp[1].stats[best_idx[1]].blk - }; - int16_t mv[2][2]; mv[0][0] = best_unipred[0]->inter.mv[0][0]; mv[0][1] = best_unipred[0]->inter.mv[0][1]; From 94096dd1755618c38a024b73cc944b4650bdfce0 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 00:34:34 +0200 Subject: [PATCH 11/85] Ignore merge candidates when computing AMVP motion vector costs. 
--- src/search_inter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 7cbf882d..474e3883 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -234,8 +234,8 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->state, x, y, 2, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcost ); @@ -1068,8 +1068,8 @@ static void search_frac(inter_search_info_t *info) costs[0] += info->mvd_cost_func(state, mv.x, mv.y, 2, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[0]); best_cost = costs[0]; @@ -1128,8 +1128,8 @@ static void search_frac(inter_search_info_t *info) mv.y + pattern[j]->y, mv_shift, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[j] ); From 8406942d06fd5885d4fc1794cfe1591fc6aab036 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 19:15:36 +0200 Subject: [PATCH 12/85] Improve the new data structure a bit. Use also for merge candidates. --- src/search.c | 12 +++--- src/search.h | 29 +++++++------ src/search_inter.c | 103 +++++++++++++++++++++------------------------ 3 files changed, 70 insertions(+), 74 deletions(-) diff --git a/src/search.c b/src/search.c index 385c4981..c0f32034 100644 --- a/src/search.c +++ b/src/search.c @@ -443,18 +443,18 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t /** * \brief Sort indices to ascending order according to costs. */ -void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map) +void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map) { // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
for (uint8_t i = 1; i < map->size; ++i) { - const int8_t cur_idx = map->idx[i]; - const double cur_cost = map->stats[cur_idx].cost; + const int8_t cur_indx = map->indx[i]; + const double cur_cost = map->cost[cur_indx]; uint8_t j = i; - while (j > 0 && cur_cost < map->stats[map->idx[j - 1]].cost) { - map->idx[j] = map->idx[j - 1]; + while (j > 0 && cur_cost < map->cost[map->indx[j - 1]]) { + map->indx[j] = map->indx[j - 1]; --j; } - map->idx[j] = cur_idx; + map->indx[j] = cur_indx; } } diff --git a/src/search.h b/src/search.h index fe6d7f5d..9617e7b9 100644 --- a/src/search.h +++ b/src/search.h @@ -44,22 +44,27 @@ #include "image.h" #include "constraint.h" -typedef struct blk_stats_t { - cu_info_t blk; // list of blocks - double cost; // list of RD costs - uint32_t bits; // list of bit costs -} blk_stats_t; + /** + * \brief Data collected during search processes. + * + * The intended use is to collect statistics of the + * searched coding/prediction units. Data related to + * a specific unit is found at index i. The arrays + * should be indexed by elements of the "indx" array + * that will be sorted by the RD costs of the units. 
+ */ +typedef struct unit_stats_map_t { -typedef struct blk_stats_map_t { - - blk_stats_t *stats; // list of block statistics entries - int8_t *idx; // list of indices to block stats (to be sorted by costs) - int size; // number of active elements in the lists -} blk_stats_map_t; + cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units + double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs + uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs + int8_t indx[MAX_REF_PIC_COUNT]; //!< list of indices to elements in the other arrays + int size; //!< number of active elements in the lists +} unit_stats_map_t; void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); -void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map); +void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index 474e3883..e8272b2b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1217,7 +1217,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, int depth, lcu_t *lcu, cu_info_t *cur_cu, - blk_stats_map_t *amvp) + unit_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1386,9 +1386,9 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (valid_mv && info->best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *pb = &cur_map->unit[entry]; pb->merged = false; pb->skipped = false; pb->inter.mv_dir = ref_list + 1; @@ -1397,8 +1397,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, pb->inter.mv[ref_list][1] = (int16_t)mv.y; CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - 
entry->cost = info->best_cost; - entry->bits = info->best_bitcost; + cur_map->cost[entry] = info->best_cost; + cur_map->bits[entry] = info->best_bitcost; cur_map->size++; } } @@ -1643,16 +1643,14 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); - // Merge Analysis starts here - int8_t mrg_cands[MRG_MAX_NUM_CANDS]; - double mrg_costs[MRG_MAX_NUM_CANDS]; - for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - mrg_cands[i] = -1; - mrg_costs[i] = MAX_DOUBLE; - } - cu_info_t orig_cu = *cur_cu; - int num_rdo_cands = 0; + + // Merge Analysis starts here + unit_stats_map_t merge = { .size = 0 }; + for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { + merge.indx[i] = -1; + merge.cost[i] = MAX_DOUBLE; + } // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { @@ -1672,8 +1670,8 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - mrg_cands, - num_rdo_cands); + merge.indx, + merge.size); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list @@ -1687,23 +1685,29 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - mrg_costs[num_rdo_cands] = kvz_satd_any_size(width, height, + + merge.cost[merge.size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt; + merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; + merge.bits[merge.size] = merge_idx; + merge.indx[merge.size] = merge.size; - mrg_cands[num_rdo_cands] = merge_idx; - num_rdo_cands++; + merge.unit[merge.size] = *cur_cu; + merge.unit[merge.size].type = CU_INTER; + merge.unit[merge.size].merge_idx = merge_idx; + merge.unit[merge.size].merged = true; + merge.unit[merge.size].skipped = false; + + merge.size++; } - // Sort candidates by cost - kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + kvz_sort_indices_by_cost(&merge); - // Limit by availability - // TODO: Do not limit to just 1 - num_rdo_cands = MIN(1, num_rdo_cands); + // Try early skip decision on just one merge candidate if available + int num_rdo_cands = MIN(1, merge.size); // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1714,7 +1718,7 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = mrg_cands[merge_rdo_idx]; + int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1748,17 +1752,12 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - blk_stats_t stats[2][MAX_REF_PIC_COUNT]; - int8_t idx[2][MAX_REF_PIC_COUNT]; - blk_stats_map_t amvp[2]; + unit_stats_map_t amvp[2] = { { .size = 0 }, { .size = 0 } }; for (int ref_list = 0; ref_list < 2; ++ref_list) { - amvp[ref_list].stats = stats[ref_list]; - amvp[ref_list].idx = idx [ref_list]; - amvp[ref_list].size = 0; for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].stats[i].blk = orig_cu; - amvp[ref_list].idx[i] = i; + amvp[ref_list].unit[i] = orig_cu; // TODO: only initialize what is necessary + amvp[ref_list].indx[i] = i; } } @@ -1772,25 +1771,25 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_indices_by_cost(&amvp[0]); kvz_sort_indices_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + int best_idx[2] = { amvp[0].indx[0], amvp[1].indx[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].stats[best_idx[0]].cost; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].stats[best_idx[1]].cost; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_idx[1]]; int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; cu_info_t *best_unipred[2] = { - &amvp[0].stats[best_idx[0]].blk, - &amvp[1].stats[best_idx[1]].blk + &amvp[0].unit[best_idx[0]], + &amvp[1].unit[best_idx[1]] }; // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures *cur_cu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].stats[best_idx[best_list]].cost; - *inter_bitcost = amvp[best_list].stats[best_idx[best_list]].bits; + *inter_cost = amvp[best_list].cost[best_idx[best_list]]; + *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; } // Search bi-pred positions @@ -1907,21 +1906,13 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - if (mrg_costs[0] < *inter_cost) { - *inter_cost = mrg_costs[0]; + int best_merge_indx = merge.indx[0]; + int best_merge_cost = merge.cost[best_merge_indx]; + + if (merge.size > 0 && best_merge_cost < *inter_cost) { + *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - int merge_idx = mrg_cands[0]; - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; - cur_cu->merged = true; - cur_cu->skipped = false; + *cur_cu = merge.unit[best_merge_indx]; } if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { From aca91920545df23b2d28e7cee39d7aa596368424 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:10:36 +0200 Subject: [PATCH 13/85] Move cu_info_t initializations to search_pu_inter. Rename cur_cu cur_pu. 
--- src/search_inter.c | 131 ++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 67 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index e8272b2b..0c079a42 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1617,7 +1617,11 @@ static void search_pu_inter(encoder_state_t * const state, const int x_local = SUB_SCU(x); const int y_local = SUB_SCU(y); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_NOTSET; + cur_pu->part_size = part_mode; + cur_pu->depth = depth; + cur_pu->qp = state->qp; inter_search_info_t info = { .state = state, @@ -1640,10 +1644,8 @@ static void search_pu_inter(encoder_state_t * const state, ); // Default to candidate 0 - CU_SET_MV_CAND(cur_cu, 0, 0); - CU_SET_MV_CAND(cur_cu, 1, 0); - - cu_info_t orig_cu = *cur_cu; + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); // Merge Analysis starts here unit_stats_map_t merge = { .size = 0 }; @@ -1656,18 +1658,18 @@ static void search_pu_inter(encoder_state_t * const state, for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; + cur_pu->inter.mv_dir = cur_cand->dir; + cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; + cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; + cur_pu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_pu->inter.mv[0][1] = cur_cand->mv[0][1]; + cur_pu->inter.mv[1][0] = cur_cand->mv[1][0]; + cur_pu->inter.mv[1][1] = cur_cand->mv[1][1]; // If bipred is not enabled, do not try candidates with mv_dir == 3. 
// Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. - if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; + if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, merge.indx, @@ -1675,10 +1677,10 @@ static void search_pu_inter(encoder_state_t * const state, // Don't try merge candidates that don't satisfy mv constraints. // Don't add duplicates to list - bool active_L0 = cur_cu->inter.mv_dir & 1; - bool active_L1 = cur_cu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_pu->inter.mv_dir & 1; + bool active_L1 = cur_pu->inter.mv_dir & 2; + if (active_L0 && !fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || is_duplicate) { continue; @@ -1695,7 +1697,7 @@ static void search_pu_inter(encoder_state_t * const state, merge.bits[merge.size] = merge_idx; merge.indx[merge.size] = merge.size; - merge.unit[merge.size] = *cur_cu; + merge.unit[merge.size] = *cur_pu; merge.unit[merge.size].type = CU_INTER; merge.unit[merge.size].merge_idx = merge_idx; merge.unit[merge.size].merged = true; @@ -1711,7 +1713,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { // Reconstruct blocks with merge candidate. 
@@ -1719,27 +1721,27 @@ static void search_pu_inter(encoder_state_t * const state, // and chroma exists. // Early terminate if merge candidate with zero CBF is found. int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true); - if (!cbf_is_set_any(cur_cu->cbf, depth)) { - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->skipped = true; + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + 
cur_pu->skipped = true; *inter_cost = 0.0; // TODO: Check this *inter_bitcost = merge_idx; // TODO: Check this return; @@ -1756,7 +1758,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int ref_list = 0; ref_list < 2; ++ref_list) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].unit[i] = orig_cu; // TODO: only initialize what is necessary + amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[ref_list].indx[i] = i; } } @@ -1765,7 +1767,7 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); } kvz_sort_indices_by_cost(&amvp[0]); @@ -1787,7 +1789,7 @@ static void search_pu_inter(encoder_state_t * const state, // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures - *cur_cu = *best_unipred[best_list]; + *cur_pu = *best_unipred[best_list]; *inter_cost = amvp[best_list].cost[best_idx[best_list]]; *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; } @@ -1856,42 +1858,42 @@ static void search_pu_inter(encoder_state_t * const state, cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; + cur_pu->inter.mv_dir = 3; - cur_cu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; + cur_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + cur_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - cur_cu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; - cur_cu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; - cur_cu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; - cur_cu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; - cur_cu->merged = 0; + cur_pu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; + cur_pu->inter.mv[0][1] = 
best_unipred[0]->inter.mv[0][1]; + cur_pu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; + cur_pu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; + cur_pu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) + if (merge_cand[merge_idx].mv[0][0] == cur_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == cur_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == cur_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == cur_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == cur_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == cur_pu->inter.mv_ref[1]) { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; + cur_pu->merged = 1; + cur_pu->merge_idx = merge_idx; break; } } // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); + kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_pu, lcu, reflist); int cu_mv_cand = select_mv_cand( info.state, info.mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], + cur_pu->inter.mv[reflist][0], + cur_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + CU_SET_MV_CAND(cur_pu, reflist, cu_mv_cand); } *inter_cost = cost; @@ -1901,7 +1903,7 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: this probably should have a separate command line option if (cfg->rdo == 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + 
search_pu_inter_bipred(&info, depth, lcu, cur_pu, inter_cost, inter_bitcost); } } @@ -1912,11 +1914,11 @@ static void search_pu_inter(encoder_state_t * const state, if (merge.size > 0 && best_merge_cost < *inter_cost) { *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - *cur_cu = merge.unit[best_merge_indx]; + *cur_pu = merge.unit[best_merge_indx]; } - if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { - assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1])); + if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } } @@ -2043,12 +2045,6 @@ void kvz_search_cu_smp(encoder_state_t * const state, const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - - cur_pu->type = CU_INTER; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->qp = state->qp; double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; @@ -2065,6 +2061,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost += cost; *inter_bitcost += bitcost; + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); From 5edb82648a0deb6bb3add740d15c3e4f332e3c0c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:20:40 +0200 Subject: [PATCH 14/85] More intuitive logic for computing RD costs and bit costs for SMP --- src/search_inter.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 0c079a42..406c6de2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2071,15 +2071,6 @@ void kvz_search_cu_smp(encoder_state_t * const 
state, } } - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); - } - // Count bits spent for coding the partition mode. int smp_extra_bits = 1; // horizontal or vertical if (state->encoder_control->cfg.amp_enable) { @@ -2092,6 +2083,16 @@ void kvz_search_cu_smp(encoder_state_t * const state, // coding the CBF. smp_extra_bits += 6; - *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits; *inter_bitcost += smp_extra_bits; + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } else { + *inter_cost += state->lambda_sqrt * smp_extra_bits; + } } From 9905cd42d6acee45f827bd0ca414f0b65190bfe4 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:23:21 +0200 Subject: [PATCH 15/85] Rename "indx" to "keys". There are too many "indices" already. --- src/search.c | 12 ++++++------ src/search.h | 6 +++--- src/search_inter.c | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/search.c b/src/search.c index c0f32034..8226e6d8 100644 --- a/src/search.c +++ b/src/search.c @@ -441,20 +441,20 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t /** - * \brief Sort indices to ascending order according to costs. + * \brief Sort keys (indices) to ascending order according to costs. */ -void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map) +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map) { // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
for (uint8_t i = 1; i < map->size; ++i) { - const int8_t cur_indx = map->indx[i]; + const int8_t cur_indx = map->keys[i]; const double cur_cost = map->cost[cur_indx]; uint8_t j = i; - while (j > 0 && cur_cost < map->cost[map->indx[j - 1]]) { - map->indx[j] = map->indx[j - 1]; + while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) { + map->keys[j] = map->keys[j - 1]; --j; } - map->indx[j] = cur_indx; + map->keys[j] = cur_indx; } } diff --git a/src/search.h b/src/search.h index 9617e7b9..de34755b 100644 --- a/src/search.h +++ b/src/search.h @@ -51,7 +51,7 @@ * The intended use is to collect statistics of the * searched coding/prediction units. Data related to * a specific unit is found at index i. The arrays - * should be indexed by elements of the "indx" array + * should be indexed by elements of the "keys" array * that will be sorted by the RD costs of the units. */ typedef struct unit_stats_map_t { @@ -59,12 +59,12 @@ typedef struct unit_stats_map_t { cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs - int8_t indx[MAX_REF_PIC_COUNT]; //!< list of indices to elements in the other arrays + int8_t keys[MAX_REF_PIC_COUNT]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); -void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map); +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index 406c6de2..cab20882 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1650,7 +1650,7 @@ static void search_pu_inter(encoder_state_t * const state, // Merge Analysis starts here 
unit_stats_map_t merge = { .size = 0 }; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - merge.indx[i] = -1; + merge.keys[i] = -1; merge.cost[i] = MAX_DOUBLE; } @@ -1672,7 +1672,7 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - merge.indx, + merge.keys, merge.size); // Don't try merge candidates that don't satisfy mv constraints. @@ -1695,7 +1695,7 @@ static void search_pu_inter(encoder_state_t * const state, // Add cost of coding the merge index merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; merge.bits[merge.size] = merge_idx; - merge.indx[merge.size] = merge.size; + merge.keys[merge.size] = merge.size; merge.unit[merge.size] = *cur_pu; merge.unit[merge.size].type = CU_INTER; @@ -1706,7 +1706,7 @@ static void search_pu_inter(encoder_state_t * const state, merge.size++; } - kvz_sort_indices_by_cost(&merge); + kvz_sort_keys_by_cost(&merge); // Try early skip decision on just one merge candidate if available int num_rdo_cands = MIN(1, merge.size); @@ -1720,7 +1720,7 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; + int merge_idx = merge.unit[merge.keys[merge_rdo_idx]].merge_idx; cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1759,7 +1759,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int ref_list = 0; ref_list < 2; ++ref_list) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[ref_list].indx[i] = i; + amvp[ref_list].keys[i] = i; } } @@ -1770,10 +1770,10 @@ static void search_pu_inter(encoder_state_t * const state, search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); } - kvz_sort_indices_by_cost(&amvp[0]); - kvz_sort_indices_by_cost(&amvp[1]); + kvz_sort_keys_by_cost(&amvp[0]); + kvz_sort_keys_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].indx[0], amvp[1].indx[0] }; + int best_idx[2] = { amvp[0].keys[0], amvp[1].keys[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; @@ -1908,7 +1908,7 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - int best_merge_indx = merge.indx[0]; + int best_merge_indx = merge.keys[0]; int best_merge_cost = merge.cost[best_merge_indx]; if (merge.size > 0 && best_merge_cost < *inter_cost) { From d28c2295dc59902fe309391a4062ab3a1f1c0cf1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 22:01:16 +0200 Subject: [PATCH 16/85] The best_* fields are no longer used to track anything. Convert costs to double. 
--- src/search_inter.c | 242 +++++++++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 108 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cab20882..ba007022 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -78,19 +78,6 @@ typedef struct { kvz_mvd_cost_func *mvd_cost_func; - /** - * \brief Best motion vector among the ones tested so far - */ - vector2d_t best_mv; - /** - * \brief Cost of best_mv - */ - double best_cost; - /** - * \brief Bit cost of best_mv - */ - uint32_t best_bitcost; - /** * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks @@ -203,20 +190,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int /** * \brief Calculate cost for an integer motion vector. * - * Updates info->best_mv, info->best_cost and info->best_bitcost to the new + * Updates best_mv, best_cost and best_bitcost to the new * motion vector if it yields a lower cost than the current one. * * If the motion vector violates the MV constraints for tiles or WPP, the * cost is not set. 
* - * \return true if info->best_mv was changed, false otherwise + * \return true if best_mv was changed, false otherwise */ -static bool check_mv_cost(inter_search_info_t *info, int x, int y) +static bool check_mv_cost(inter_search_info_t *info, + int x, + int y, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; uint32_t bitcost = 0; - uint32_t cost = kvz_image_calc_sad( + double cost = kvz_image_calc_sad( info->pic, info->ref, info->origin.x, @@ -228,7 +220,7 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->optimized_sad ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; cost += info->mvd_cost_func( info->state, @@ -240,13 +232,13 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) &bitcost ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; // Set to motion vector in quarter pixel precision. - info->best_mv.x = x * 4; - info->best_mv.y = y * 4; - info->best_cost = cost; - info->best_bitcost = bitcost; + best_mv->x = x * 4; + best_mv->y = y * 4; + *best_cost = cost; + *best_bits = bitcost; return true; } @@ -297,12 +289,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * \brief Select starting point for integer motion estimation search. * * Checks the zero vector, extra_mv and merge candidates and updates - * info->best_mv to the best one. + * best_mv to the best one. */ -static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv) +static void select_starting_point(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. - check_mv_cost(info, 0, 0); + check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); // Change to integer precision. 
extra_mv.x >>= 2; @@ -310,7 +306,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv // Check mv_in if it's not one of the merge candidates. if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { - check_mv_cost(info, extra_mv.x, extra_mv.y); + check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } // Go through candidates @@ -322,7 +318,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv if (x == 0 && y == 0) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -432,14 +428,17 @@ static double calc_mvd_cost(const encoder_state_t *state, } -static bool early_terminate(inter_search_info_t *info) +static bool early_terminate(inter_search_info_t *info, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, { 0, -1 }, { -1, 0 }, { 0, 0 }, }; - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; int first_index = 0; int last_index = 3; @@ -449,9 +448,9 @@ static bool early_terminate(inter_search_info_t *info) if (info->state->encoder_control->cfg.me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE) { - threshold = info->best_cost * 0.95; + threshold = *best_cost * 0.95; } else { - threshold = info->best_cost; + threshold = *best_cost; } int best_index = 6; @@ -459,7 +458,7 @@ static bool early_terminate(inter_search_info_t *info) int x = mv.x + small_hexbs[i].x; int y = mv.y + small_hexbs[i].y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -469,7 +468,7 @@ static bool early_terminate(inter_search_info_t *info) mv.y += small_hexbs[best_index].y; // If best match is not better than threshold, we stop the search. 
- if (info->best_cost >= threshold) { + if (*best_cost >= threshold) { return true; } @@ -484,7 +483,10 @@ void kvz_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, vector2d_t mv, - int *best_dist) + int *best_dist, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { assert(pattern_type < 4); @@ -586,7 +588,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, int x = mv.x + offset.x; int y = mv.y + offset.y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -599,20 +601,27 @@ void kvz_tz_pattern_search(inter_search_info_t *info, void kvz_tz_raster_search(inter_search_info_t *info, int iSearchRange, - int iRaster) + int iRaster, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { - const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + const vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; //compute SAD values for every point in the iRaster downsampled version of the current search area for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) { for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) { - check_mv_cost(info, mv.x + x, mv.y + y); + check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv); } } } -static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) +static void tz_search(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -624,25 +633,25 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } - vector2d_t start = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; // step 2, grid search int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); // Break the loop if the last three rounds didn't produce a better MV. if (best_dist != iDist) rounds_without_improvement++; @@ -655,7 +664,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) start.y = 0; rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); if (best_dist != iDist) rounds_without_improvement++; if (rounds_without_improvement >= 3) break; @@ -665,7 +674,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //step 3, raster scan if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; - kvz_tz_raster_search(info, iSearchRange, iRaster); + kvz_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv); } //step 4 @@ -673,19 +682,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //raster refinement if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - start.x = info->best_mv.x >> 2; - start.y = info->best_mv.y >> 2; - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + start.x = 
best_mv->x >> 2; + start.y = best_mv->y >> 2; + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } //star refinement (repeat step 2 for the current starting point) while (use_star_refinement && best_dist > 0) { best_dist = 0; - start.x = info->best_mv.x >> 2; - start.y = info->best_mv.y >> 2; + start.x = best_mv->x >> 2; + start.y = best_mv->y >> 2; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } } @@ -707,7 +716,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void hexagon_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -732,27 +746,27 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. for (int i = 1; i < 7; ++i) { - if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) { + if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -781,7 +795,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Iterate through the next 3 points. for (int i = 0; i < 3; ++i) { vector2d_t offset = large_hexbs[start + i]; - if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) { + if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) { best_index = start + i; } } @@ -793,7 +807,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Do the final step of the search with a small pattern. for (int i = 1; i < 9; ++i) { - check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); + check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv); } } @@ -813,7 +827,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. 
**/ -static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void diamond_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -832,28 +851,28 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, 0} }; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } // current motion vector - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // current best index enum diapos best_index = DIA_CENTER; // initial search of the points of the diamond for (int i = 0; i < 5; ++i) { - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -883,7 +902,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // this is where we came from so it's checked already if (i == from_dir) continue; - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; better_found = 1; } @@ -905,12 +924,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 static void search_mv_full(inter_search_info_t *info, int32_t search_range, - vector2d_t extra_mv) + vector2d_t extra_mv, + double *best_cost, + uint32_t 
*best_bits, + vector2d_t *best_mv) { // Search around the 0-vector. for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -922,7 +944,7 @@ static void search_mv_full(inter_search_info_t *info, if (!mv_in_merge(info, extra_mv)) { for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, extra_mv.x + x, extra_mv.y + y); + check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv); } } } @@ -969,7 +991,7 @@ static void search_mv_full(inter_search_info_t *info, } if (already_tested) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } } @@ -982,7 +1004,10 @@ static void search_mv_full(inter_search_info_t *info, * Algoritm first searches 1/2-pel positions around integer mv and after best match is found, * refines the search by searching best 1/4-pel postion around best 1/2-pel position. 
*/ -static void search_frac(inter_search_info_t *info) +static void search_frac(inter_search_info_t *info, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: // 5 3 6 @@ -995,10 +1020,10 @@ static void search_frac(inter_search_info_t *info) }; // Set mv to pixel precision - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; - double best_cost = MAX_DOUBLE; - uint32_t best_bitcost = 0; + double cost = MAX_DOUBLE; + uint32_t bitcost = 0; uint32_t bitcosts[4] = { 0 }; unsigned best_index = 0; @@ -1072,8 +1097,8 @@ static void search_frac(inter_search_info_t *info) 0, info->ref_idx, &bitcosts[0]); - best_cost = costs[0]; - best_bitcost = bitcosts[0]; + cost = costs[0]; + bitcost = bitcosts[0]; //Set mv to half-pixel precision mv.x *= 2; @@ -1137,9 +1162,9 @@ static void search_frac(inter_search_info_t *info) } for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; + if (within_tile[j] && costs[j] < cost) { + cost = costs[j]; + bitcost = bitcosts[j]; best_index = i + j; } } @@ -1165,9 +1190,9 @@ static void search_frac(inter_search_info_t *info) } } - info->best_mv = mv; - info->best_cost = best_cost; - info->best_bitcost = best_bitcost; + *best_mv = mv; + *best_cost = cost; + *best_bits = bitcost; } /** @@ -1264,7 +1289,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // store old values back cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - vector2d_t mv = { 0, 0 }; + vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. 
// When temporal motion vector candidates are added, there is probably @@ -1320,7 +1345,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Check if the mv is valid after scaling if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; + best_mv = mv_previous; } } @@ -1333,11 +1358,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = MAX_DOUBLE; + double best_cost = MAX_DOUBLE; + uint32_t best_bits = MAX_INT; switch (cfg->ime_algorithm) { case KVZ_IME_TZ: - tz_search(info, mv); + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); break; case KVZ_IME_FULL64: @@ -1345,45 +1371,45 @@ static void search_pu_inter_ref(inter_search_info_t *info, case KVZ_IME_FULL16: case KVZ_IME_FULL8: case KVZ_IME_FULL: - search_mv_full(info, search_range, mv); + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); break; case KVZ_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); break; default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); break; } - if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { - search_frac(info); + if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { + search_frac(info, &best_cost, &best_bits, &best_mv); - } else if (info->best_cost < MAX_DOUBLE) { + } else if (best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. 
- info->best_cost = kvz_image_calc_satd( + best_cost = kvz_image_calc_satd( info->state->tile->frame->source, info->ref, info->origin.x, info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), + info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), info->width, info->height); - info->best_cost += info->best_bitcost * info->state->lambda_sqrt; + best_cost += best_bits * info->state->lambda_sqrt; } - mv = info->best_mv; - // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + best_bits += cur_cu->inter.mv_dir - 1 + mv_ref_coded; // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv && info->best_cost < MAX_DOUBLE) { + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures unit_stats_map_t *cur_map = &amvp[ref_list]; @@ -1393,12 +1419,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, pb->skipped = false; pb->inter.mv_dir = ref_list + 1; pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + pb->inter.mv[ref_list][0] = (int16_t)best_mv.x; + pb->inter.mv[ref_list][1] = (int16_t)best_mv.y; CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - cur_map->cost[entry] = info->best_cost; - cur_map->bits[entry] = info->best_bitcost; + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; cur_map->size++; } } From 574d6c45930e8b7a626093a918651ff1744ab7a1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti 
Date: Fri, 3 Dec 2021 22:11:49 +0200 Subject: [PATCH 17/85] Eliminate copy-paste logic from different ME algorithms. --- src/search_inter.c | 96 ++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 62 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index ba007022..cae57c61 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -633,19 +633,7 @@ static void tz_search(inter_search_info_t *info, const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } - + vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; // step 2, grid search @@ -746,19 +734,6 @@ static void hexagon_search(inter_search_info_t *info, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } - vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. @@ -850,19 +825,6 @@ static void diamond_search(inter_search_info_t *info, {0, -1}, {1, 0}, {0, 1}, {-1, 0}, {0, 0} }; - - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } // current motion vector vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; @@ -1361,34 +1323,44 @@ static void search_pu_inter_ref(inter_search_info_t *info, double best_cost = MAX_DOUBLE; uint32_t best_bits = MAX_INT; - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); - break; + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). + select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); - break; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_DIA: - diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_FULL64: + case KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, 
&best_bits, &best_mv); + break; + } + + if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { + search_frac(info, &best_cost, &best_bits, &best_mv); + + } } - if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { - search_frac(info, &best_cost, &best_bits, &best_mv); - - } else if (best_cost < MAX_DOUBLE) { + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. best_cost = kvz_image_calc_satd( info->state->tile->frame->source, From 70a393a3dca67d37863e14002da8900bcdc9e58f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 5 Dec 2021 00:21:09 +0200 Subject: [PATCH 18/85] Set mv candidates before cost calculations for bipred. Use the new struct for bipred. --- src/search_inter.c | 108 +++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cae57c61..893c1ee8 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1443,11 +1443,23 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - int16_t mv[2][2]; + cur_cu->inter.mv_dir = 3; + + cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; + cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = cur_cu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; + + cur_cu->merged = false; + cur_cu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + } // Don't try merge candidates that don't satisfy mv constraints. 
if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || @@ -1497,16 +1509,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, cost += info->state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { @@ -1525,7 +1527,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); int cu_mv_cand = select_mv_cand( info->state, info->mv_cand, @@ -1752,12 +1753,12 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - unit_stats_map_t amvp[2] = { { .size = 0 }, { .size = 0 } }; + unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; - for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[ref_list].keys[i] = i; + amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary + amvp[mv_dir - 1].keys[i] = i; } } @@ -1799,6 +1800,8 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { + cu_info_t *bipred_pu = &amvp[2].unit[0]; + // Try biprediction from valid acquired unipreds. 
if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1809,15 +1812,27 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; - int16_t mv[2][2]; + bipred_pu->inter.mv_dir = 3; + + bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; + + int16_t (*mv)[2] = bipred_pu->inter.mv; mv[0][0] = best_unipred[0]->inter.mv[0][0]; mv[0][1] = best_unipred[0]->inter.mv[0][1]; mv[1][0] = best_unipred[1]->inter.mv[1][0]; mv[1][1] = best_unipred[1]->inter.mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, bipred_pu, lcu, reflist); + } kvz_inter_recon_bipred(info.state, - ref->images[ref_LX[0][best_unipred[0]->inter.mv_ref[0]]], - ref->images[ref_LX[1][best_unipred[1]->inter.mv_ref[1]]], + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, width, height, @@ -1834,74 +1849,71 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; cost += info.mvd_cost_func(info.state, - best_unipred[0]->inter.mv[0][0], - best_unipred[0]->inter.mv[0][1], + bipred_pu->inter.mv[0][0], + bipred_pu->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); cost += info.mvd_cost_func(info.state, - best_unipred[1]->inter.mv[1][0], - best_unipred[1]->inter.mv[1][1], + bipred_pu->inter.mv[1][0], + bipred_pu->inter.mv[1][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - best_unipred[0]->inter.mv_ref[0], - best_unipred[1]->inter.mv_ref[1] + bipred_pu->inter.mv_ref[0], + bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_pu->inter.mv_dir = 3; - - cur_pu->inter.mv_ref[0] = 
best_unipred[0]->inter.mv_ref[0]; - cur_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - - cur_pu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; - cur_pu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; - cur_pu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; - cur_pu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; - cur_pu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_pu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_pu->inter.mv_ref[1]) + if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) { - cur_pu->merged = 1; - cur_pu->merge_idx = merge_idx; + bipred_pu->merged = 1; + bipred_pu->merge_idx = merge_idx; break; } } // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_pu, lcu, reflist); int cu_mv_cand = select_mv_cand( info.state, info.mv_cand, - cur_pu->inter.mv[reflist][0], - cur_pu->inter.mv[reflist][1], + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_pu, reflist, cu_mv_cand); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } *inter_cost = cost; *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + + *cur_pu = *bipred_pu; } } // TODO: this probably should have a separate 
command line option if (cfg->rdo == 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_pu, inter_cost, inter_bitcost); + cu_info_t bipred_pu = *cur_pu; + double prior_cost = *inter_cost; + search_pu_inter_bipred(&info, depth, lcu, &bipred_pu, inter_cost, inter_bitcost); + + if (*inter_cost < prior_cost) { + *cur_pu = bipred_pu; + } } } From adb31ce959a8eea88417ed5d43509a67ca9d0d70 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 5 Dec 2021 16:13:01 +0200 Subject: [PATCH 19/85] Use the new struct for bipred refinement as well --- src/search_inter.c | 111 +++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 893c1ee8..7727488a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1409,9 +1409,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, */ static void search_pu_inter_bipred(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; @@ -1443,22 +1442,24 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - cur_cu->inter.mv_dir = 3; + cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + bipred_pu->inter.mv_dir = 3; - int16_t(*mv)[2] = cur_cu->inter.mv; + bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0]; + bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = bipred_pu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = false; - cur_cu->skipped = false; + bipred_pu->merged = false; + bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; 
reflist++) { - kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); } // Don't try merge candidates that don't satisfy mv constraints. @@ -1508,37 +1509,35 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } + // Check every candidate to find a match + for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) + { + bipred_pu->merged = true; + bipred_pu->merge_idx = merge_idx; + break; } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - int cu_mv_cand = select_mv_cand( - info->state, - info->mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], - NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = 
bitcost[0] + bitcost[1] + extra_bits; } + + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); + } + + amvp_bipred->cost[amvp_bipred->size] = cost; + amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->size++; } } @@ -1801,6 +1800,8 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { cu_info_t *bipred_pu = &amvp[2].unit[0]; + double best_bipred_cost = MAX_DOUBLE; + uint32_t best_bipred_bits = MAX_INT; // Try biprediction from valid acquired unipreds. if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1843,19 +1844,19 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - double cost = + best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); uint32_t bitcost[2] = { 0, 0 }; - cost += info.mvd_cost_func(info.state, + best_bipred_cost += info.mvd_cost_func(info.state, bipred_pu->inter.mv[0][0], bipred_pu->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); - cost += info.mvd_cost_func(info.state, + best_bipred_cost += info.mvd_cost_func(info.state, bipred_pu->inter.mv[1][0], bipred_pu->inter.mv[1][1], 0, @@ -1868,9 +1869,9 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits; + best_bipred_cost += info.state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { + if (best_bipred_cost < *inter_cost) { // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { @@ 
-1898,22 +1899,22 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; - - *cur_pu = *bipred_pu; + amvp[2].cost[amvp[2].size] = best_bipred_cost; + amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].size++; } } // TODO: this probably should have a separate command line option - if (cfg->rdo == 3) { - cu_info_t bipred_pu = *cur_pu; - double prior_cost = *inter_cost; - search_pu_inter_bipred(&info, depth, lcu, &bipred_pu, inter_cost, inter_bitcost); + if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); + + kvz_sort_keys_by_cost(&amvp[2]); + int best_bipred_key = amvp[2].keys[0]; - if (*inter_cost < prior_cost) { - *cur_pu = bipred_pu; - } + if (amvp[2].size > 0 && amvp[2].cost[best_bipred_key] < *inter_cost) { + *inter_cost = amvp[2].cost[best_bipred_key]; + *inter_bitcost = amvp[2].bits[best_bipred_key]; + *cur_pu = amvp[2].unit[best_bipred_key]; } } From dc4676eef1b51521caafcd269d01664a6246b135 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 15:35:13 +0200 Subject: [PATCH 20/85] Remove merge attempts from bipred functions --- src/search_inter.c | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 7727488a..d6751d38 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1509,21 +1509,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info->state->lambda_sqrt * extra_bits; - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && - 
merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) - { - bipred_pu->merged = true; - bipred_pu->merge_idx = merge_idx; - break; - } - } - // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( @@ -1801,7 +1786,6 @@ static void search_pu_inter(encoder_state_t * const state, cu_info_t *bipred_pu = &amvp[2].unit[0]; double best_bipred_cost = MAX_DOUBLE; - uint32_t best_bipred_bits = MAX_INT; // Try biprediction from valid acquired unipreds. if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1811,8 +1795,6 @@ static void search_pu_inter(encoder_state_t * const state, const image_list_t *const ref = info.state->frame->ref; uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; - inter_merge_cand_t *merge_cand = info.merge_cand; - bipred_pu->inter.mv_dir = 3; bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; @@ -1873,21 +1855,6 @@ static void search_pu_inter(encoder_state_t * const state, if (best_bipred_cost < *inter_cost) { - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) - { - bipred_pu->merged = 1; - bipred_pu->merge_idx = merge_idx; - break; - } - } - // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( From 3a219146edea9a2de1ddc040765708bc5033585e Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 15:47:14 +0200 Subject: 
[PATCH 21/85] Rename some variables --- src/search_inter.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index d6751d38..df221bd4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1386,14 +1386,14 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Map reference index to L0/L1 pictures unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; - cu_info_t *pb = &cur_map->unit[entry]; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)best_mv.x; - pb->inter.mv[ref_list][1] = (int16_t)best_mv.y; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); cur_map->cost[entry] = best_cost; cur_map->bits[entry] = best_bits; @@ -1697,13 +1697,13 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { - for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { + for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge.unit[merge.keys[merge_rdo_idx]].merge_idx; + int merge_idx = merge.unit[merge.keys[merge_key]].merge_idx; cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1756,25 +1756,25 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_idx[1]]; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_keys[1]]; int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; cu_info_t *best_unipred[2] = { - &amvp[0].unit[best_idx[0]], - &amvp[1].unit[best_idx[1]] + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] }; // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures *cur_pu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].cost[best_idx[best_list]]; - *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; + *inter_cost = amvp[best_list].cost[best_keys[best_list]]; + *inter_bitcost = amvp[best_list].bits[best_keys[best_list]]; } // Search bi-pred positions @@ -1886,13 +1886,13 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - int best_merge_indx = merge.keys[0]; - int best_merge_cost = merge.cost[best_merge_indx]; + int best_merge_key = merge.keys[0]; + int best_merge_cost = merge.cost[best_merge_key]; if (merge.size > 0 && best_merge_cost < *inter_cost) { *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - *cur_pu = merge.unit[best_merge_indx]; + *cur_pu = merge.unit[best_merge_key]; } if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { From 2b9b398524c555b4bd6463f5bd47cc57e2bc5d16 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 16:05:23 +0200 Subject: [PATCH 22/85] Remove now unnecessary state store/restore --- src/search_inter.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index df221bd4..dcb45d2f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1232,9 +1232,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, int LX_idx = ref_list_idx[ref_list]; - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; @@ -1248,9 +1245,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, lcu, ref_list); - // store old values back - 
cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. From 4d02b69c4e75d98d149a93c76cd5cf69fdcb163c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 19:34:05 +0200 Subject: [PATCH 23/85] Set CU type in inter search functions --- src/search_inter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search_inter.c b/src/search_inter.c index dcb45d2f..8db16cec 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1381,6 +1381,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->type = CU_INTER; unipred_pu->merged = false; unipred_pu->skipped = false; unipred_pu->inter.mv_dir = ref_list + 1; @@ -1514,6 +1515,8 @@ static void search_pu_inter_bipred(inter_search_info_t *info, CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } + bipred_pu->type = CU_INTER; + amvp_bipred->cost[amvp_bipred->size] = cost; amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; amvp_bipred->size++; From 0b223b24f21d43a42aadf514735a7725aa1129be Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 22:37:27 +0200 Subject: [PATCH 24/85] Fix comment --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 8db16cec..3c6f035d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1732,7 +1732,7 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search starts here - // Store unipred information of L0 and L1 for biprediction + // Store information of L0, L1, and bipredictions // Best cost will be left at MAX_DOUBLE if no valid CU is found unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; From bdece66dc40c4873c7fd1e0b92b17eaf21c8f919 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 
Dec 2021 23:12:47 +0200 Subject: [PATCH 25/85] Compare the final costs only once and then set the current CU --- src/search_inter.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 3c6f035d..9d077acb 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1740,6 +1740,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].keys[i] = i; + amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1766,14 +1767,6 @@ static void search_pu_inter(encoder_state_t * const state, &amvp[1].unit[best_keys[1]] }; - // Set best valid unipred to cur_cu - if (best_cost < MAX_DOUBLE) { - // Map reference index to L0/L1 pictures - *cur_pu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].cost[best_keys[best_list]]; - *inter_bitcost = amvp[best_list].bits[best_keys[best_list]]; - } - // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1850,7 +1843,7 @@ static void search_pu_inter(encoder_state_t * const state, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; best_bipred_cost += info.state->lambda_sqrt * extra_bits; - if (best_bipred_cost < *inter_cost) { + if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { @@ -1873,28 +1866,38 @@ static void search_pu_inter(encoder_state_t * const state, if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); kvz_sort_keys_by_cost(&amvp[2]); - int best_bipred_key = amvp[2].keys[0]; + } - if (amvp[2].size > 0 && amvp[2].cost[best_bipred_key] < *inter_cost) { - *inter_cost = amvp[2].cost[best_bipred_key]; - *inter_bitcost = amvp[2].bits[best_bipred_key]; - *cur_pu = amvp[2].unit[best_bipred_key]; + cu_info_t* 
best_inter_pu = NULL; + + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; } } // Compare best merge cost to amvp cost - int best_merge_key = merge.keys[0]; - int best_merge_cost = merge.cost[best_merge_key]; + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { - if (merge.size > 0 && best_merge_cost < *inter_cost) { - *inter_cost = best_merge_cost; - *inter_bitcost = 0; // TODO: Check this - *cur_pu = merge.unit[best_merge_key]; + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this } if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } + + *cur_pu = *best_inter_pu; } /** From 3e967c0077862fe762cda39059712712bb81ebf8 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 23:30:34 +0200 Subject: [PATCH 26/85] Add missing assertion and set cu before --- src/search_inter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9d077acb..88a896ac 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1893,11 +1893,15 @@ static void search_pu_inter(encoder_state_t * const state, *inter_bitcost = 0; // TODO: Check this } - if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { + *cur_pu = *best_inter_pu; + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } - *cur_pu = *best_inter_pu; + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, 
cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } /** From 4e19f7b71e673d2f5acf50ff3bf1d6aefc55b47f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 7 Dec 2021 00:35:50 +0200 Subject: [PATCH 27/85] Move mode decision logic and current PU setting to higher-level functions --- src/search_inter.c | 322 ++++++++++++++++++++++++++++----------------- 1 file changed, 199 insertions(+), 123 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 88a896ac..76fb8d54 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -68,7 +68,7 @@ typedef struct { /** * \brief Top-left corner of the PU */ - const vector2d_t origin; + vector2d_t origin; int32_t width; int32_t height; @@ -1555,7 +1555,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, } /** - * \brief Update PU to have best modes at this depth. + * \brief Collect PU parameters and costs at this depth. * * \param state encoder state * \param x_cu x-coordinate of the containing CU @@ -1565,28 +1565,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, * \param i_pu index of the PU in the CU * \param lcu containing LCU * - * \param inter_cost Return inter cost of the best mode - * \param inter_bitcost Return inter bitcost of the best mode + * \param amvp Return searched AMVP PUs sorted by costs + * \param merge Return searched Merge PUs sorted by costs */ static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost) + int x_cu, int y_cu, + int depth, + part_mode_t part_mode, + int i_pu, + lcu_t *lcu, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + inter_search_info_t *info) { - *inter_cost = MAX_DOUBLE; - *inter_bitcost = MAX_INT; - const kvz_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = 
PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = LCU_WIDTH >> depth; + const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); + const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); + const int width = PU_GET_W(part_mode, width_cu, i_pu); + const int height = PU_GET_H(part_mode, width_cu, i_pu); // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and // nRx2N partitions. @@ -1595,31 +1593,31 @@ static void search_pu_inter(encoder_state_t * const state, // 2NxnD partitions. const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; cur_pu->part_size = part_mode; cur_pu->depth = depth; cur_pu->qp = state->qp; - inter_search_info_t info = { - .state = state, - .pic = frame->source, - .origin = { x, y }, - .width = width, - .height = height, - .mvd_cost_func = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost, - .optimized_sad = kvz_get_optimized_sad(width), - }; + + info->state = state; + info->pic = frame->source; + info->origin.x = x; + info->origin.y = y; + info->width = width; + info->height = height; + info->mvd_cost_func = cfg->mv_rdo ? 
kvz_calc_mvd_cost_cabac : calc_mvd_cost; + info->optimized_sad = kvz_get_optimized_sad(width); // Search for merge mode candidates - info.num_merge_cand = kvz_inter_get_merge_cand( + info->num_merge_cand = kvz_inter_get_merge_cand( state, x, y, width, height, merge_a1, merge_b1, - info.merge_cand, + info->merge_cand, lcu ); @@ -1628,16 +1626,16 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_pu, 1, 0); // Merge Analysis starts here - unit_stats_map_t merge = { .size = 0 }; + merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - merge.keys[i] = -1; - merge.cost[i] = MAX_DOUBLE; + merge->keys[i] = -1; + merge->cost[i] = MAX_DOUBLE; } // Check motion vector constraints and perform rough search - for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; + inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx]; cur_pu->inter.mv_dir = cur_cand->dir; cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; @@ -1651,16 +1649,16 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - merge.keys, - merge.size); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, + merge->keys, + merge->size); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list bool active_L0 = cur_pu->inter.mv_dir & 1; bool active_L1 = cur_pu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || + if (active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || is_duplicate) { continue; @@ -1668,28 +1666,28 @@ static void search_pu_inter(encoder_state_t * const state, kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - merge.cost[merge.size] = kvz_satd_any_size(width, height, + merge->cost[merge->size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; - merge.bits[merge.size] = merge_idx; - merge.keys[merge.size] = merge.size; + merge->cost[merge->size] += merge_idx * info->state->lambda_sqrt; + merge->bits[merge->size] = merge_idx; + merge->keys[merge->size] = merge->size; - merge.unit[merge.size] = *cur_pu; - merge.unit[merge.size].type = CU_INTER; - merge.unit[merge.size].merge_idx = merge_idx; - merge.unit[merge.size].merged = true; - merge.unit[merge.size].skipped = false; + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; - merge.size++; + merge->size++; } - kvz_sort_keys_by_cost(&merge); + kvz_sort_keys_by_cost(merge); // Try early skip decision on just one merge candidate if available - int num_rdo_cands = MIN(1, merge.size); + int num_rdo_cands = MIN(1, merge->size); // Early Skip Mode Decision bool has_chroma = 
state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1700,14 +1698,14 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. - int merge_idx = merge.unit[merge.keys[merge_key]].merge_idx; - cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_pu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_pu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_pu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_pu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); kvz_inter_recon_cu(state, lcu, x, y, width, true, false); kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); @@ -1722,8 +1720,11 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; - *inter_cost = 0.0; // TODO: Check this - *inter_bitcost = merge_idx; // TODO: Check this + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; return; } } @@ -1732,9 +1733,9 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search 
starts here - // Store information of L0, L1, and bipredictions - // Best cost will be left at MAX_DOUBLE if no valid CU is found - unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; + amvp[0].size = 0; + amvp[1].size = 0; + amvp[2].size = 0; for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { @@ -1745,10 +1746,10 @@ static void search_pu_inter(encoder_state_t * const state, } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { - info.ref_idx = ref_idx; - info.ref = state->frame->ref->images[ref_idx]; + info->ref_idx = ref_idx; + info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); } kvz_sort_keys_by_cost(&amvp[0]); @@ -1782,8 +1783,8 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. - const image_list_t *const ref = info.state->frame->ref; - uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; bipred_pu->inter.mv_dir = 3; @@ -1800,10 +1801,10 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, bipred_pu, lcu, reflist); + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); } - kvz_inter_recon_bipred(info.state, + kvz_inter_recon_bipred(info->state, ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, @@ -1821,18 +1822,18 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; - best_bipred_cost += info.mvd_cost_func(info.state, + best_bipred_cost += 
info->mvd_cost_func(info->state, bipred_pu->inter.mv[0][0], bipred_pu->inter.mv[0][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[0]); - best_bipred_cost += info.mvd_cost_func(info.state, + best_bipred_cost += info->mvd_cost_func(info->state, bipred_pu->inter.mv[1][0], bipred_pu->inter.mv[1][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[1]); @@ -1841,15 +1842,15 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - best_bipred_cost += info.state->lambda_sqrt * extra_bits; + best_bipred_cost += info->state->lambda_sqrt * extra_bits; if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( - info.state, - info.mv_cand, + info->state, + info->mv_cand, bipred_pu->inter.mv[reflist][0], bipred_pu->inter.mv[reflist][1], NULL); @@ -1863,45 +1864,10 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); + if (cfg->rdo == 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); kvz_sort_keys_by_cost(&amvp[2]); } - - cu_info_t* best_inter_pu = NULL; - - for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { - - int best_key = amvp[mv_dir - 1].keys[0]; - - if (amvp[mv_dir - 1].size > 0 && - amvp[mv_dir - 1].cost[best_key] < *inter_cost) { - - best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; - *inter_cost = amvp[mv_dir - 1].cost[best_key]; - *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; - } - } - - // Compare best merge cost to amvp cost - int best_merge_key = merge.keys[0]; - - if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { - - best_inter_pu = &merge.unit[best_merge_key]; - *inter_cost = merge.cost[best_merge_key]; - *inter_bitcost = 0; // TODO: Check this - } - - 
*cur_pu = *best_inter_pu; - - if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { - assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); - } - - if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { - assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); - } } /** @@ -1973,13 +1939,69 @@ void kvz_search_cu_inter(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. + unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu, - inter_cost, - inter_bitcost); + amvp, + &merge, + &info); + // Early Skip CU decision + if (merge.size == 1 && merge.unit[0].skipped) { + *inter_cost = merge.cost[0]; + *inter_bitcost = merge.bits[0]; + return; + } + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this + } + + if (*inter_cost == MAX_DOUBLE) { + // Could not find any motion vector. 
+ *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + return; + } + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + *cur_pu = *best_inter_pu; + // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, @@ -1988,6 +2010,14 @@ void kvz_search_cu_inter(encoder_state_t * const state, inter_cost, inter_bitcost); } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } @@ -2014,6 +2044,16 @@ void kvz_search_cu_smp(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. 
+ unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + const int num_pu = kvz_part_mode_num_parts[part_mode]; const int width = LCU_WIDTH >> depth; const int y_local = SUB_SCU(y); @@ -2031,19 +2071,47 @@ void kvz_search_cu_smp(encoder_state_t * const state, double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; - search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); + search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + cost = amvp[mv_dir - 1].cost[best_key]; + bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + cost = merge.cost[best_merge_key]; + bitcost = 0; // TODO: Check this + } if (cost == MAX_DOUBLE) { // Could not find any motion vector. 
- *inter_cost = MAX_DOUBLE; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; return; } - *inter_cost += cost; + *inter_cost += cost; *inter_bitcost += bitcost; cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); + *cur_pu = *best_inter_pu; + for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); @@ -2051,6 +2119,14 @@ void kvz_search_cu_smp(encoder_state_t * const state, scu->inter = cur_pu->inter; } } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } // Count bits spent for coding the partition mode. From 7f7112cc5762c145171f30ec9d24a52c8ab30387 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 7 Dec 2021 22:04:41 +0200 Subject: [PATCH 28/85] Use up-to-date value of mv dir for bit cost calculations --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 76fb8d54..5112c0b7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1371,7 +1371,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - best_bits += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + best_bits += ref_list + mv_ref_coded; // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); From 706d718d5d38f8b9a408bf726f1aa0daf09f6684 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 00:49:19 +0200 Subject: [PATCH 29/85] Perform FME for n best PUs from L0 and L1. 
--- src/search_inter.c | 74 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 5112c0b7..50548b1f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1347,11 +1347,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, &best_cost, &best_bits, &best_mv); break; } - - if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { - search_frac(info, &best_cost, &best_bits, &best_mv); - - } } if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { @@ -1768,6 +1763,75 @@ static void search_pu_inter(encoder_state_t * const state, &amvp[1].unit[best_keys[1]] }; + // Fractional-pixel motion estimation. + // Refine the best PUs so far from both lists, if available. + for (int list = 0; list < 2; ++list) { + + // TODO: make configurable + int n_best = MIN(1, amvp[list].size); + if (cfg->fme_level > 0) { + + for (int i = 0; i < n_best; ++i) { + + int key = amvp[list].keys[i]; + cu_info_t *unipred_pu = &amvp[list].unit[key]; + + // Find the reference picture + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int LX_idx = unipred_pu->inter.mv_ref[list]; + info->ref_idx = ref_LX[list][LX_idx]; + info->ref = ref->images[info->ref_idx]; + + kvz_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + unipred_pu, + lcu, + list); + + double *cost = &amvp[list].cost[key]; + + double frac_cost = MAX_DOUBLE; + uint32_t frac_bits = MAX_INT; + vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; + + search_frac(info, &frac_cost, &frac_bits, &frac_mv); + + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); + frac_bits += list + mv_ref_coded; + + bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); + if (valid_mv) { + + 
unipred_pu->inter.mv[list][0] = frac_mv.x; + unipred_pu->inter.mv[list][1] = frac_mv.y; + CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + + amvp[list].cost[key] = frac_cost; + amvp[list].bits[key] = frac_bits; + } + } + + // Invalidate PUs with SAD-based costs. (FME not performed). + // TODO: Recalculate SAD costs with SATD for further processing. + for (int i = n_best; i < amvp[list].size; ++i) { + int key = amvp[list].keys[i]; + amvp[list].cost[key] = MAX_DOUBLE; + } + } + + // Costs are now, SATD-based. Omit PUs with SAD-based costs. + // TODO: Recalculate SAD costs with SATD for further processing. + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size = n_best; + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred From 1af90b194efc21e9e35e09d8c24fbbbda602c4b1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 15:27:05 +0200 Subject: [PATCH 30/85] Add missing bits to RD costs. --- src/search_inter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 50548b1f..16da0168 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1366,7 +1366,9 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - best_bits += ref_list + mv_ref_coded; + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + best_cost += extra_bits * info->state->lambda_sqrt; + best_bits += extra_bits; // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); @@ -1804,7 +1806,9 @@ static void search_pu_inter(encoder_state_t * const state, uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); - frac_bits += list + mv_ref_coded; + const int 
extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing + frac_cost += extra_bits * info->state->lambda_sqrt; + frac_bits += extra_bits; bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); if (valid_mv) { From ae498553c0f3f825519924584661812b6b1abf97 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 15:53:31 +0200 Subject: [PATCH 31/85] Add define MAX_UNIT_STATS_MAP_SIZE. Add assertions to inter search. --- src/search.h | 9 +++++---- src/search_inter.c | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/search.h b/src/search.h index de34755b..e4b299c3 100644 --- a/src/search.h +++ b/src/search.h @@ -44,6 +44,7 @@ #include "image.h" #include "constraint.h" +#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) /** * \brief Data collected during search processes. @@ -56,10 +57,10 @@ */ typedef struct unit_stats_map_t { - cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units - double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs - uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs - int8_t keys[MAX_REF_PIC_COUNT]; //!< list of keys (indices) to elements in the other arrays + cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units + double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs + uint32_t bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; diff --git a/src/search_inter.c b/src/search_inter.c index 16da0168..9f317021 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1681,6 +1681,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->size++; } + assert(merge->size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(merge); // Try early skip decision on just one merge candidate if available @@ -1735,7 
+1736,7 @@ static void search_pu_inter(encoder_state_t * const state, amvp[2].size = 0; for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { - for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { + for (int i = 0; i < state->frame->ref->used_size; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].keys[i] = i; amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; @@ -1749,6 +1750,8 @@ static void search_pu_inter(encoder_state_t * const state, search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); } + assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); + assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); @@ -1934,6 +1937,7 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: this probably should have a separate command line option if (cfg->rdo == 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); } } From 49935710a8808fb790e9286fe82775294b6b9cb2 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 16:35:47 +0200 Subject: [PATCH 32/85] Only one ME per reference picture (same ref in L0 and L1) --- src/search_inter.c | 294 +++++++++++++++++++++++---------------------- 1 file changed, 149 insertions(+), 145 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9f317021..4dbe2db9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1227,170 +1227,174 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Must find at least one reference picture assert(ref_list_active[0] || ref_list_active[1]); - for (int ref_list = 0; ref_list < 2; ++ref_list) { - if (ref_list_active[ref_list]) { + // Does not matter which list is used, if in both. + int ref_list = ref_list_active[0] ? 
0 : 1; + int LX_idx = ref_list_idx[ref_list]; - int LX_idx = ref_list_idx[ref_list]; + // Get MV candidates + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; - // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + kvz_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + cur_cu, + lcu, + ref_list); - kvz_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + vector2d_t best_mv = { 0, 0 }; - vector2d_t best_mv = { 0, 0 }; - - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - vector2d_t mv_previous = { 0, 0 }; - if (ref_cu->inter.mv_dir & 1) { - mv_previous.x = ref_cu->inter.mv[0][0]; - mv_previous.y = ref_cu->inter.mv[0][1]; - } else { - mv_previous.x = ref_cu->inter.mv[1][0]; - mv_previous.y = ref_cu->inter.mv[1][1]; - } - // Apply mv scaling if neighbor poc is available - if (info->state->frame->ref_LX_size[ref_list] > 0) { - // When there are reference pictures from the future (POC > current POC) - // in L0 or L1, the primary list for the colocated PU is the inverse of - // collocated_from_l0_flag. Otherwise it is equal to reflist. - // - // Kvazaar always sets collocated_from_l0_flag so the list is L1 when - // there are future references. 
- int col_list = ref_list; - for (int i = 0; i < info->state->frame->ref->used_size; i++) { - if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { - col_list = 1; - break; - } - } - if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { - // Use the other list if the colocated PU does not have a MV for the - // primary list. - col_list = 1 - col_list; - } - - uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; - // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument - apply_mv_scaling( - info->state->frame->poc, - info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], - info->state->frame->ref->pocs[neighbor_poc_index], - info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ - info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] - [ref_cu->inter.mv_ref[col_list]] - ], - &mv_previous - ); - } - - // Check if the mv is valid after scaling - if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - best_mv = mv_previous; + // Take starting point for MV search from previous frame. + // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. 
+ const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + vector2d_t mv_previous = { 0, 0 }; + if (ref_cu->inter.mv_dir & 1) { + mv_previous.x = ref_cu->inter.mv[0][0]; + mv_previous.y = ref_cu->inter.mv[0][1]; + } else { + mv_previous.x = ref_cu->inter.mv[1][0]; + mv_previous.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; } } - - int search_range = 32; - switch (cfg->ime_algorithm) { - case KVZ_IME_FULL64: search_range = 64; break; - case KVZ_IME_FULL32: search_range = 32; break; - case KVZ_IME_FULL16: search_range = 16; break; - case KVZ_IME_FULL8: search_range = 8; break; - default: break; + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. 
+ col_list = 1 - col_list; } - double best_cost = MAX_DOUBLE; - uint32_t best_bits = MAX_INT; + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv_previous + ); + } - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); - bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + // Check if the mv is valid after scaling + if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { + best_mv = mv_previous; + } + } + + int search_range = 32; + switch (cfg->ime_algorithm) { + case KVZ_IME_FULL64: search_range = 64; break; + case KVZ_IME_FULL32: search_range = 32; break; + case KVZ_IME_FULL16: search_range = 16; break; + case KVZ_IME_FULL8: search_range = 8; break; + default: break; + } + + double best_cost = MAX_DOUBLE; + uint32_t best_bits = MAX_INT; + + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). 
+ select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); - if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); - break; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_FULL64: + case KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_DIA: - diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; - } - } + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + } + } - if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { - // Recalculate inter cost with SATD. 
- best_cost = kvz_image_calc_satd( - info->state->tile->frame->source, - info->ref, - info->origin.x, - info->origin.y, - info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), - info->width, - info->height); - best_cost += best_bits * info->state->lambda_sqrt; - } + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. + best_cost = kvz_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), + info->width, + info->height); + best_cost += best_bits * info->state->lambda_sqrt; + } - // Only check when candidates are different - uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing - best_cost += extra_bits * info->state->lambda_sqrt; - best_bits += extra_bits; + double LX_cost[2] = { best_cost, best_cost }; + double LX_bits[2] = { best_bits, best_bits }; - // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); - if (valid_mv && best_cost < MAX_DOUBLE) { + // Compute costs and add entries for both lists, if necessary + for (; ref_list_active[ref_list] && ref_list < 2; ++ref_list) { - // Map reference index to L0/L1 pictures - unit_stats_map_t *cur_map = &amvp[ref_list]; - int entry = cur_map->size; - cu_info_t *unipred_pu = &cur_map->unit[entry]; - unipred_pu->type = CU_INTER; - unipred_pu->merged = false; - unipred_pu->skipped = false; - unipred_pu->inter.mv_dir = ref_list + 1; - unipred_pu->inter.mv_ref[ref_list] = LX_idx; - unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; - unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; - CU_SET_MV_CAND(unipred_pu, 
ref_list, cu_mv_cand); + LX_idx = ref_list_idx[ref_list]; + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt; + LX_bits[ref_list] += extra_bits; - cur_map->cost[entry] = best_cost; - cur_map->bits[entry] = best_bits; - cur_map->size++; - } + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { + + // Map reference index to L0/L1 pictures + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->type = CU_INTER; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); + + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; + cur_map->size++; } } } From c411e659775405817e0a9857bfb715339ef28458 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 18:31:09 +0200 Subject: [PATCH 33/85] Prevent FME and bipred from the same reference picture if present in L0 and L1 --- src/search_inter.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 4dbe2db9..cca383b4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1760,6 +1760,38 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[1]); int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + + cu_info_t *best_unipred[2] = { + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] + }; + + // 
Prevent using the same ref picture with both lists. + // TODO: allow searching two MVs from the same reference picture. + if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { + + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int L0_idx = best_unipred[0]->inter.mv_ref[0]; + int L1_idx = best_unipred[1]->inter.mv_ref[1]; + + int L0_ref_idx = ref_LX[0][L0_idx]; + int L1_ref_idx = ref_LX[1][L1_idx]; + + if (L0_ref_idx == L1_ref_idx) { + // Invalidate the other based the list that has the 2nd best PU + double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE; + double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE; + int list = (L0_2nd_cost <= L1_2nd_cost) ? 1 : 0; + amvp[list].cost[best_keys[list]] = MAX_DOUBLE; + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size--; + best_keys[list] = amvp[list].keys[0]; + best_unipred[list] = &amvp[list].unit[best_keys[list]]; + } + } + double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; @@ -1767,11 +1799,6 @@ static void search_pu_inter(encoder_state_t * const state, int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? best_cost_L0 : best_cost_L1; - cu_info_t *best_unipred[2] = { - &amvp[0].unit[best_keys[0]], - &amvp[1].unit[best_keys[1]] - }; - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { From f17a500b779f8c87d39410cc42279542380ae7af Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:06:12 +0200 Subject: [PATCH 34/85] Get rid of warnings. 
(Unused variables, suggested parentheses) --- src/search_inter.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cca383b4..15606d52 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1658,8 +1658,8 @@ static void search_pu_inter(encoder_state_t * const state, // Don't add duplicates to list bool active_L0 = cur_pu->inter.mv_dir & 1; bool active_L1 = cur_pu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || + if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) || + (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) || is_duplicate) { continue; @@ -1770,7 +1770,6 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: allow searching two MVs from the same reference picture. if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { - const image_list_t *const ref = info->state->frame->ref; uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; int L0_idx = best_unipred[0]->inter.mv_ref[0]; @@ -1792,13 +1791,6 @@ static void search_pu_inter(encoder_state_t * const state, } } - double best_cost_L0 = MAX_DOUBLE; - double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_keys[1]]; - int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; - int best_cost = (best_cost_L0 <= best_cost_L1) ? best_cost_L0 : best_cost_L1; - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. 
for (int list = 0; list < 2; ++list) { @@ -1830,8 +1822,6 @@ static void search_pu_inter(encoder_state_t * const state, lcu, list); - double *cost = &amvp[list].cost[key]; - double frac_cost = MAX_DOUBLE; uint32_t frac_bits = MAX_INT; vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; From e87b12dec17d6b0135f0ce9fcbad47669ffeceb9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:09:16 +0200 Subject: [PATCH 35/85] Move mv_cand initialization to better place --- src/search_inter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 15606d52..659a112c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1602,6 +1602,9 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->depth = depth; cur_pu->qp = state->qp; + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); info->state = state; info->pic = frame->source; @@ -1622,10 +1625,6 @@ static void search_pu_inter(encoder_state_t * const state, lcu ); - // Default to candidate 0 - CU_SET_MV_CAND(cur_pu, 0, 0); - CU_SET_MV_CAND(cur_pu, 1, 0); - // Merge Analysis starts here merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { From bb1f2a0895d4489464fac4348ccb9c738d707652 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:13:25 +0200 Subject: [PATCH 36/85] Reorder condition to prevent indexing past the array --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 659a112c..6afa6c22 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1366,7 +1366,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, double LX_bits[2] = { best_bits, best_bits }; // Compute costs and add entries for both lists, if necessary - for (; ref_list_active[ref_list] && ref_list < 2; ++ref_list) { + for (; ref_list < 2 && ref_list_active[ref_list]; 
++ref_list) { LX_idx = ref_list_idx[ref_list]; uint8_t mv_ref_coded = LX_idx; From a1a7036445c66571d1725c2d9c58917aff81da92 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 23:03:18 +0200 Subject: [PATCH 37/85] Fix indexing. Get rid of warning about jump depending on uninitialized value. --- src/search_inter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 6afa6c22..67a3166d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1534,14 +1534,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info, * * \return Does an identical candidate exist in list */ -static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, - inter_merge_cand_t * cand_to_add, - int8_t * added_idx_list, - int list_size) +static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, + inter_merge_cand_t *cand_to_add, + unit_stats_map_t *merge) { bool found = false; - for (int i = 0; i < list_size && !found; ++i) { - inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]]; + for (int i = 0; i < merge->size && !found; ++i) { + int key = merge->keys[i]; + inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx]; found = cand_to_add->dir == list_cand->dir && cand_to_add->ref[0] == list_cand->ref[0] && @@ -1606,6 +1606,8 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_pu, 0, 0); CU_SET_MV_CAND(cur_pu, 1, 0); + FILL(*info, 0); + info->state = state; info->pic = frame->source; info->origin.x = x; @@ -1649,9 +1651,7 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, - merge->keys, - merge->size); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); // Don't try 
merge candidates that don't satisfy mv constraints. // Don't add duplicates to list From e45c6a9c68fa511a10a97f6a0ac13d59b264484f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:02:26 +0200 Subject: [PATCH 38/85] Fix too few added keys in inter search stats. The function search_pu_inter_bipred may add more PUs than there are reference pictures. --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 67a3166d..e670683c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1394,6 +1394,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, cur_map->cost[entry] = best_cost; cur_map->bits[entry] = best_bits; + cur_map->keys[entry] = entry; cur_map->size++; } } @@ -1520,6 +1521,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, amvp_bipred->cost[amvp_bipred->size] = cost; amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size; amvp_bipred->size++; } } @@ -1741,7 +1743,6 @@ static void search_pu_inter(encoder_state_t * const state, for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < state->frame->ref->used_size; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[mv_dir - 1].keys[i] = i; amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1950,6 +1951,7 @@ static void search_pu_inter(encoder_state_t * const state, amvp[2].cost[amvp[2].size] = best_bipred_cost; amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].keys[amvp[2].size] = amvp[2].size; amvp[2].size++; } } From 2424a976a408112e24f2eed1617d246025b15fab Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:21:58 +0200 Subject: [PATCH 39/85] Prevent using uninitialized memory --- src/search_inter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 
e670683c..59aa7342 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1759,7 +1759,10 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); - int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + int best_keys[2] = { + amvp[0].size > 0 ? amvp[0].keys[0] : 0, + amvp[1].size > 0 ? amvp[1].keys[0] : 0 + }; cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], From 51dd942778ef30288bcc270adc79bf5ba685f1a2 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:32:08 +0200 Subject: [PATCH 40/85] Fix uninitialized fields of CU/PU infos. --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 59aa7342..08594b9f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1383,6 +1383,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; cu_info_t *unipred_pu = &cur_map->unit[entry]; + *unipred_pu = *cur_cu; unipred_pu->type = CU_INTER; unipred_pu->merged = false; unipred_pu->skipped = false; @@ -1440,6 +1441,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, } cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; + *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); bipred_pu->inter.mv_dir = 3; @@ -1742,7 +1744,6 @@ static void search_pu_inter(encoder_state_t * const state, for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < state->frame->ref->used_size; ++i) { - amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1871,6 +1872,7 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { cu_info_t *bipred_pu = &amvp[2].unit[0]; + *bipred_pu = *cur_pu; double best_bipred_cost = MAX_DOUBLE; // Try biprediction from valid acquired unipreds. 
From 6c50939af3261d1bbe449319509d12e26f6d728c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Mar 2018 10:46:30 +0200 Subject: [PATCH 41/85] Different roi-matrix for each frame Implement reading the roi-matrix for each frame from binary file. Extremely simple and breaks on any unhappy paths. # Conflicts: # src/cfg.c # src/cli.c # src/encoder.c # src/image.c # src/rate_control.c --- src/cfg.c | 13 +++++++++++++ src/cli.c | 1 + src/encmain.c | 29 ++++++++++++++++++++++++++++- src/encoder.c | 2 +- src/image.c | 7 +++++++ src/kvazaar.h | 10 ++++++++++ src/rate_control.c | 23 ++++++++++++++++++++++- 7 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 07c71a55..6a1fcf40 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -142,6 +142,8 @@ int kvz_config_init(kvz_config *cfg) cfg->roi.width = 0; cfg->roi.height = 0; cfg->roi.dqps = NULL; + + cfg->roi_file = NULL; cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -190,6 +192,7 @@ int kvz_config_destroy(kvz_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); + FREE_POINTER(cfg->roi_file); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); @@ -1296,6 +1299,16 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) fclose(f); } + else if OPT("roi-file") + { + char* roifile = strdup(value); + if (!roifile) { + fprintf(stderr, "Failed to allocate memory for roi file name.\n"); + return 0; + } + FREE_POINTER(cfg->roi_file); + cfg->roi_file = roifile; + } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); } diff --git a/src/cli.c b/src/cli.c index 811537b3..4aa86794 100644 --- a/src/cli.c +++ b/src/cli.c @@ -141,6 +141,7 @@ static const struct option long_options[] = { { "force-level", required_argument, NULL, 0 }, { "high-tier", no_argument, NULL, 0 }, { "me-steps", required_argument, NULL, 0 }, + { "roi-file", required_argument, NULL, 0 }, { "fast-residual-cost", 
required_argument, NULL, 0 }, { "set-qp-in-cu", no_argument, NULL, 0 }, { "open-gop", no_argument, NULL, 0 }, diff --git a/src/encmain.c b/src/encmain.c index 5804c7f8..37f1c121 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -147,6 +147,7 @@ typedef struct { // Parameters passed from main thread to input thread. FILE* input; + FILE* roi_file; const kvz_api *api; const cmdline_opts_t *opts; const encoder_control_t *encoder; @@ -248,6 +249,21 @@ static void* input_read_thread(void* in_args) } } + if(args->roi_file) { + if (fread(&frame_in->roi, 4, 2, args->roi_file) != 2) { + fprintf(stderr, "Failed to read roi matrix size for frame: %d. Shutting down.\n", frames_read); + retval = RETVAL_FAILURE; + goto done; + } + const size_t roi_size = frame_in->roi.height*frame_in->roi.width; + frame_in->roi.roi_array = malloc(roi_size); + if(fread(frame_in->roi.roi_array, 1, roi_size, args->roi_file) != roi_size) { + fprintf(stderr, "Failed to read roi matrix for frame: %d. Shutting down.\n", frames_read); + retval = RETVAL_FAILURE; + goto done; + } + } + frames_read++; if (args->encoder->cfg.source_scan_type != 0) { @@ -427,6 +443,7 @@ int main(int argc, char *argv[]) FILE *input = NULL; //!< input file (YUV) FILE *output = NULL; //!< output file (HEVC NAL stream) FILE *recout = NULL; //!< reconstructed YUV output, --debug + FILE *roifile = NULL; clock_t start_time = clock(); clock_t encoding_start_cpu_time; KVZ_CLOCK_T encoding_start_real_time; @@ -493,6 +510,14 @@ int main(int argc, char *argv[]) goto exit_failure; } + if(opts->config->roi_file) { + roifile = fopen(opts->config->roi_file, "rb"); + if(roifile == NULL) { + fprintf(stderr, "Could not open roi file although it was required. Shutting down!\n"); + goto exit_failure; + } + } + #ifdef _WIN32 // Set stdin and stdout to binary for pipes. 
if (input == stdin) { @@ -566,9 +591,10 @@ int main(int argc, char *argv[]) // Give arguments via struct to the input thread input_handler_args in_args = { .available_input_slots = available_input_slots, - .filled_input_slots = filled_input_slots, + .filled_input_slots = filled_input_slots, .input = input, + .roi_file = roifile, .api = api, .opts = opts, .encoder = encoder, @@ -805,6 +831,7 @@ done: if (input) fclose(input); if (output) fclose(output); if (recout) fclose(recout); + if (roifile) fclose(roifile); CHECKPOINTS_FINALIZE(); diff --git a/src/encoder.c b/src/encoder.c index e582cc38..dd485e6a 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -416,7 +416,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) // for SMP and AMP partition units. encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.roi_file || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { encoder->max_qp_delta_depth = 0; } else { encoder->max_qp_delta_depth = -1; diff --git a/src/image.c b/src/image.c index ddd58d47..c923e78f 100644 --- a/src/image.c +++ b/src/image.c @@ -100,6 +100,10 @@ kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_ im->interlacing = KVZ_INTERLACING_NONE; + im->roi.roi_array = NULL; + im->roi.width = 0; + im->roi.height = 0; + return im; } @@ -126,6 +130,7 @@ void kvz_image_free(kvz_picture *const im) kvz_image_free(im->base_image); } else { free(im->fulldata_buf); + if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array); } // Make sure freed data won't be used. 
@@ -186,6 +191,8 @@ kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image, im->pts = 0; im->dts = 0; + im->roi = orig_image->roi; + return im; } diff --git a/src/kvazaar.h b/src/kvazaar.h index f03ffa27..967a3c67 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -393,6 +393,8 @@ typedef struct kvz_config int8_t *dqps; } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */ + char *roi_file; + unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */ /** @@ -510,6 +512,14 @@ typedef struct kvz_picture { enum kvz_chroma_format chroma_format; int32_t ref_pocs[16]; + + struct + { + int width; + int height; + int8_t *roi_array; + } roi; + } kvz_picture; /** diff --git a/src/rate_control.c b/src/rate_control.c index 4978ae04..e5620fb0 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1085,7 +1085,25 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL) { + if (ctrl->cfg.roi.dqps != NULL || state->tile->frame->source->roi.roi_array) { + vector2d_t lcu_vec = { + pos.x + state->tile->lcu_offset_x, + pos.y + state->tile->lcu_offset_y + }; + vector2d_t roi = { + lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu, + lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu + }; + int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width; + int dqp = state->tile->frame->source->roi.roi_array[roi_index]; + if(dqp != 0) { + pos.x = 0; + } + state->qp = CLIP_TO_QP(state->frame->QP + dqp); + state->lambda = qp_to_lambda(state, state->qp); + state->lambda_sqrt = sqrt(state->frame->lambda); + } + else if (ctrl->cfg.roi.dqps != NULL) { vector2d_t lcu = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y @@ -1096,6 +1114,9 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const 
state, }; int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; int dqp = ctrl->cfg.roi.dqps[roi_index]; + if (dqp != 0) { + pos.x = 0; + } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); From 917d26f1bf86286523cb9f2545bda363c6c97724 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 6 Feb 2022 20:08:28 +0200 Subject: [PATCH 42/85] Unify delta QP / ROI map functionality. --- README.md | 19 +++- configure.ac | 4 +- doc/kvazaar.1 | 21 ++-- src/cfg.c | 88 ++++------------ src/cli.c | 19 +++- src/encmain.c | 25 ----- src/encode_coding_tree.c | 2 +- src/encoder.c | 125 +++-------------------- src/encoder.h | 2 +- src/encoder_state-bitstream.c | 4 +- src/encoderstate.c | 186 +++++++++++++++++++++++++++++++++- src/encoderstate.h | 6 +- src/filter.c | 2 +- src/kvazaar.h | 17 ++-- src/rate_control.c | 21 +--- 15 files changed, 285 insertions(+), 256 deletions(-) diff --git a/README.md b/README.md index 2daa0fae..5d36012e 100644 --- a/README.md +++ b/README.md @@ -156,11 +156,20 @@ Video structure: - frametile: Constrain within the tile. - frametilemargin: Constrain even more. --roi : Use a delta QP map for region of interest. - Reads an array of delta QP values from a text - file. The file format is: width and height of - the QP delta map followed by width*height delta - QP values in raster order. The map can be of any - size and will be scaled to the video size. + Reads an array of delta QP values from a file. + Text and binary files are supported and detected + from the file extension (.txt/.bin). If a known + extension is not found, the file is treated as + a text file. The file can include one or many + ROI frames each in the following format: + width and height of the QP delta map followed + by width * height delta QP values in raster + order. In binary format, width and height are + 32-bit integers whereas the delta QP values are + signed 8-bit values. 
The map can be of any size + and will be scaled to the video size. The file + reading will loop if end of the file is reached. + See roi.txt in the examples folder. --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. in PPS and slice_qp_delta in slize header zero. --(no-)erp-aqp : Use adaptive QP for 360 degree video with diff --git a/configure.ac b/configure.ac index 832b584d..178a9b3d 100644 --- a/configure.ac +++ b/configure.ac @@ -22,8 +22,8 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # - Increment when making new releases and major or minor was not changed since last release. # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html -ver_major=6 -ver_minor=6 +ver_major=7 +ver_minor=0 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/doc/kvazaar.1 b/doc/kvazaar.1 index 93def73f..c5883b84 100644 --- a/doc/kvazaar.1 +++ b/doc/kvazaar.1 @@ -1,4 +1,4 @@ -.TH KVAZAAR "1" "October 2021" "kvazaar v2.1.0" "User Commands" +.TH KVAZAAR "1" "February 2022" "kvazaar v2.1.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS @@ -180,11 +180,20 @@ Constrain movement vectors. [none] .TP \fB\-\-roi Use a delta QP map for region of interest. -Reads an array of delta QP values from a text -file. The file format is: width and height of -the QP delta map followed by width*height delta -QP values in raster order. The map can be of any -size and will be scaled to the video size. +Reads an array of delta QP values from a file. +Text and binary files are supported and detected +from the file extension (.txt/.bin). If a known +extension is not found, the file is treated as +a text file. The file can include one or many +ROI frames each in the following format: +width and height of the QP delta map followed +by width * height delta QP values in raster +order. In binary format, width and height are +32\-bit integers whereas the delta QP values are +signed 8\-bit values. 
The map can be of any size +and will be scaled to the video size. The file +reading will loop if end of the file is reached. +See roi.txt in the examples folder. .TP \fB\-\-set\-qp\-in\-cu Set QP at CU level keeping pic_init_qp_minus26. diff --git a/src/cfg.c b/src/cfg.c index 6a1fcf40..69745a96 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -139,11 +139,9 @@ int kvz_config_init(kvz_config *cfg) cfg->gop_lp_definition.t = 1; cfg->open_gop = true; - cfg->roi.width = 0; - cfg->roi.height = 0; - cfg->roi.dqps = NULL; - - cfg->roi_file = NULL; + cfg->roi.file_path = NULL; + cfg->roi.format = KVZ_ROI_TXT; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -192,12 +190,11 @@ int kvz_config_destroy(kvz_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); - FREE_POINTER(cfg->roi_file); + FREE_POINTER(cfg->roi.file_path); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); FREE_POINTER(cfg->slice_addresses_in_ts); - FREE_POINTER(cfg->roi.dqps); FREE_POINTER(cfg->optional_key); FREE_POINTER(cfg->fastrd_learning_outdir_fn); } @@ -1244,70 +1241,29 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) } else if OPT("implicit-rdpcm") cfg->implicit_rdpcm = (bool)atobool(value); + else if OPT("roi") { - // The ROI description is as follows: - // First number is width, second number is height, - // then follows width * height number of dqp values. 
- FILE* f = fopen(value, "rb"); - if (!f) { - fprintf(stderr, "Could not open ROI file.\n"); + static enum kvz_roi_format const formats[] = { KVZ_ROI_TXT, KVZ_ROI_BIN }; + static const char * const format_names[] = { "txt", "bin", NULL }; + + char *roi_file = strdup(value); + if (!roi_file) { + fprintf(stderr, "Failed to allocate memory for ROI file name.\n"); return 0; } + FREE_POINTER(cfg->roi.file_path); + cfg->roi.file_path = roi_file; - int width = 0; - int height = 0; - if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) { - fprintf(stderr, "Failed to read ROI size.\n"); - fclose(f); - return 0; + // Get file extension or the substring after the last dot + char *maybe_extension = strrchr(cfg->roi.file_path, '.'); + if (!maybe_extension) { + cfg->roi.format = KVZ_ROI_TXT; + } else { + maybe_extension++; + int8_t format; + bool unknown_format = !parse_enum(maybe_extension, format_names, &format); + cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formats[format]; } - - if (width <= 0 || height <= 0) { - fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height); - fclose(f); - return 0; - } - - if (width > 10000 || height > 10000) { - fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); - fclose(f); - return 0; - } - - const unsigned size = width * height; - int8_t *dqp_array = calloc((size_t)size, sizeof(cfg->roi.dqps[0])); - if (!dqp_array) { - fprintf(stderr, "Failed to allocate memory for ROI table.\n"); - fclose(f); - return 0; - } - - FREE_POINTER(cfg->roi.dqps); - cfg->roi.dqps = dqp_array; - cfg->roi.width = width; - cfg->roi.height = height; - - for (int i = 0; i < size; ++i) { - int number; // Need a pointer to int for fscanf - if (fscanf(f, "%d", &number) != 1) { - fprintf(stderr, "Reading ROI file failed.\n"); - fclose(f); - return 0; - } - dqp_array[i] = CLIP(-51, 51, number); - } - - fclose(f); - } - else if OPT("roi-file") - { - char* roifile = strdup(value); - if (!roifile) { - fprintf(stderr, "Failed to allocate memory 
for roi file name.\n"); - return 0; - } - FREE_POINTER(cfg->roi_file); - cfg->roi_file = roifile; } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); diff --git a/src/cli.c b/src/cli.c index 4aa86794..b32c10c0 100644 --- a/src/cli.c +++ b/src/cli.c @@ -501,11 +501,20 @@ void print_help(void) " - frametile: Constrain within the tile.\n" " - frametilemargin: Constrain even more.\n" " --roi : Use a delta QP map for region of interest.\n" - " Reads an array of delta QP values from a text\n" - " file. The file format is: width and height of\n" - " the QP delta map followed by width*height delta\n" - " QP values in raster order. The map can be of any\n" - " size and will be scaled to the video size.\n" + " Reads an array of delta QP values from a file.\n" + " Text and binary files are supported and detected\n" + " from the file extension (.txt/.bin). If a known\n" + " extension is not found, the file is treated as\n" + " a text file. The file can include one or many\n" + " ROI frames each in the following format:\n" + " width and height of the QP delta map followed\n" + " by width * height delta QP values in raster\n" + " order. In binary format, width and height are\n" + " 32-bit integers whereas the delta QP values are\n" + " signed 8-bit values. The map can be of any size\n" + " and will be scaled to the video size. The file\n" + " reading will loop if end of the file is reached.\n" + " See roi.txt in the examples folder.\n" " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" " in PPS and slice_qp_delta in slize header zero.\n" " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" diff --git a/src/encmain.c b/src/encmain.c index 37f1c121..6d172442 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -147,7 +147,6 @@ typedef struct { // Parameters passed from main thread to input thread. 
FILE* input; - FILE* roi_file; const kvz_api *api; const cmdline_opts_t *opts; const encoder_control_t *encoder; @@ -249,21 +248,6 @@ static void* input_read_thread(void* in_args) } } - if(args->roi_file) { - if (fread(&frame_in->roi, 4, 2, args->roi_file) != 2) { - fprintf(stderr, "Failed to read roi matrix size for frame: %d. Shutting down.\n", frames_read); - retval = RETVAL_FAILURE; - goto done; - } - const size_t roi_size = frame_in->roi.height*frame_in->roi.width; - frame_in->roi.roi_array = malloc(roi_size); - if(fread(frame_in->roi.roi_array, 1, roi_size, args->roi_file) != roi_size) { - fprintf(stderr, "Failed to read roi matrix for frame: %d. Shutting down.\n", frames_read); - retval = RETVAL_FAILURE; - goto done; - } - } - frames_read++; if (args->encoder->cfg.source_scan_type != 0) { @@ -510,14 +494,6 @@ int main(int argc, char *argv[]) goto exit_failure; } - if(opts->config->roi_file) { - roifile = fopen(opts->config->roi_file, "rb"); - if(roifile == NULL) { - fprintf(stderr, "Could not open roi file although it was required. Shutting down!\n"); - goto exit_failure; - } - } - #ifdef _WIN32 // Set stdin and stdout to binary for pipes. 
if (input == stdin) { @@ -594,7 +570,6 @@ int main(int argc, char *argv[]) .filled_input_slots = filled_input_slots, .input = input, - .roi_file = roifile, .api = api, .opts = opts, .encoder = encoder, diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 90df4dd1..03b04943 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -758,7 +758,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ - if (depth <= ctrl->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } diff --git a/src/encoder.c b/src/encoder.c index dd485e6a..67751f56 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -32,9 +32,6 @@ #include "encoder.h" -// This define is required for M_PI on Windows. -#define _USE_MATH_DEFINES -#include #include #include @@ -45,14 +42,6 @@ #include "kvz_math.h" #include "fast_coeff_cost.h" -/** - * \brief Strength of QP adjustments when using adaptive QP for 360 video. - * - * Determined empirically. - */ -static const double ERP_AQP_STRENGTH = 3.0; - - static int encoder_control_init_gop_layer_weights(encoder_control_t * const); static unsigned cfg_num_threads(void) @@ -136,82 +125,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder) } -/** - * \brief Return weight for 360 degree ERP video - * - * Returns the scaling factor of area from equirectangular projection to - * spherical surface. - * - * \param y y-coordinate of the pixel - * \param h height of the picture - */ -static double ws_weight(int y, int h) -{ - return cos((y - 0.5 * h + 0.5) * (M_PI / h)); -} - - - -/** - * \brief Update ROI QPs for 360 video with equirectangular projection. - * - * Writes updated ROI parameters to encoder->cfg.roi. 
- * - * \param encoder encoder control - * \param orig_roi original delta QPs or NULL - * \param orig_width width of orig_roi - * \param orig_height height of orig_roi - */ -static void init_erp_aqp_roi(encoder_control_t* encoder, - int8_t *orig_roi, - int32_t orig_width, - int32_t orig_height) -{ - // Update ROI with WS-PSNR delta QPs. - int height = encoder->in.height_in_lcu; - int width = orig_roi ? orig_width : 1; - - int frame_height = encoder->in.real_height; - - encoder->cfg.roi.width = width; - encoder->cfg.roi.height = height; - encoder->cfg.roi.dqps = calloc(width * height, sizeof(orig_roi[0])); - - double total_weight = 0.0; - for (int y = 0; y < frame_height; y++) { - total_weight += ws_weight(y, frame_height); - } - - for (int y_lcu = 0; y_lcu < height; y_lcu++) { - int y_orig = LCU_WIDTH * y_lcu; - int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); - - double lcu_weight = 0.0; - for (int y = y_orig; y < y_orig + lcu_height; y++) { - lcu_weight += ws_weight(y, frame_height); - } - // Normalize. - lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); - - int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); - - if (orig_roi) { - // If a ROI array already exists, we copy the existing values to the - // new array while adding qp_delta to each. - int y_roi = y_lcu * orig_height / height; - for (int x = 0; x < width; x++) { - encoder->cfg.roi.dqps[x + y_lcu * width] = - CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta); - } - - } else { - // Otherwise, simply write qp_delta to the ROI array. - encoder->cfg.roi.dqps[y_lcu] = qp_delta; - } - } -} - - /** * \brief Allocate and initialize an encoder control structure. 
* @@ -353,6 +266,16 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) encoder->scaling_list.use_default_list = 1; } + // ROI / delta QP + if (cfg->roi.file_path) { + const char *mode[2] = { "r", "rb" }; + encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]); + if (!encoder->roi_file) { + fprintf(stderr, "Could not open ROI file.\n"); + goto init_failed; + } + } + if (cfg->fast_coeff_table_fn) { FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb"); if (fast_coeff_table_f == NULL) { @@ -396,32 +319,10 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) goto init_failed; } - if (cfg->erp_aqp) { - init_erp_aqp_roi(encoder, - cfg->roi.dqps, - cfg->roi.width, - cfg->roi.height); - - } else if (cfg->roi.dqps) { - // Copy delta QP array for ROI coding. - const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height; - encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0])); - memcpy(encoder->cfg.roi.dqps, - cfg->roi.dqps, - roi_size * sizeof(*cfg->roi.dqps)); - - } - // NOTE: When tr_depth_inter is equal to 0, the transform is still split // for SMP and AMP partition units. 
encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.roi_file || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { - encoder->max_qp_delta_depth = 0; - } else { - encoder->max_qp_delta_depth = -1; - } - //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; @@ -724,7 +625,7 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) FREE_POINTER(encoder->tiles_tile_id); - FREE_POINTER(encoder->cfg.roi.dqps); + FREE_POINTER(encoder->cfg.roi.file_path); FREE_POINTER(encoder->cfg.optional_key); kvz_scalinglist_destroy(&encoder->scaling_list); @@ -734,6 +635,10 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) kvz_close_rdcost_outfiles(); + if (encoder->roi_file) { + fclose(encoder->roi_file); + } + free(encoder); } diff --git a/src/encoder.h b/src/encoder.h index 89f6b3a2..24a93f86 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -130,7 +130,7 @@ typedef struct encoder_control_t //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - int8_t max_qp_delta_depth; + FILE *roi_file; int tr_depth_inter; diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 707103ad..05b934d4 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -503,10 +503,10 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { // Use separate QP for each LCU when rate control is enabled. 
WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); - WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth"); + WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth"); } else { WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); } diff --git a/src/encoderstate.c b/src/encoderstate.c index 6bcce76b..6e3cf0b4 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -32,6 +32,9 @@ #include "encoderstate.h" + // This define is required for M_PI on Windows. +#define _USE_MATH_DEFINES +#include #include #include #include @@ -51,6 +54,13 @@ #include "strategies/strategies-picture.h" +/** + * \brief Strength of QP adjustments when using adaptive QP for 360 video. + * + * Determined empirically. + */ +static const double ERP_AQP_STRENGTH = 3.0; + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -570,7 +580,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - if (depth <= state->encoder_control->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } @@ -650,7 +660,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); @@ -1252,6 +1262,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) } } + +/** + * \brief Return weight for 360 degree ERP video + * + * Returns the scaling factor of area from equirectangular projection to + * spherical surface. 
+ * + * \param y y-coordinate of the pixel + * \param h height of the picture + */ +static double ws_weight(int y, int h) +{ + return cos((y - 0.5 * h + 0.5) * (M_PI / h)); +} + + +/** + * \brief Update ROI QPs for 360 video with equirectangular projection. + * + * Updates the ROI parameters in frame->roi. + * + * \param encoder encoder control + * \param frame frame that will have the ROI map + */ +static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame) +{ + int8_t *orig_roi = frame->roi.roi_array; + int32_t orig_width = frame->roi.width; + int32_t orig_height = frame->roi.height; + + // Update ROI with WS-PSNR delta QPs. + int new_height = encoder->in.height_in_lcu; + int new_width = orig_roi ? orig_width : 1; + int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0])); + + int frame_height = encoder->in.real_height; + + double total_weight = 0.0; + for (int y = 0; y < frame_height; y++) { + total_weight += ws_weight(y, frame_height); + } + + for (int y_lcu = 0; y_lcu < new_height; y_lcu++) { + int y_orig = LCU_WIDTH * y_lcu; + int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); + + double lcu_weight = 0.0; + for (int y = y_orig; y < y_orig + lcu_height; y++) { + lcu_weight += ws_weight(y, frame_height); + } + // Normalize. + lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); + + int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); + + if (orig_roi) { + // If a ROI array already exists, we copy the existing values to the + // new array while adding qp_delta to each. + int y_roi = y_lcu * orig_height / new_height; + for (int x = 0; x < new_width; x++) { + new_array[x + y_lcu * new_width] = + CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta); + } + + } else { + // Otherwise, simply write qp_delta to the ROI array. 
+ new_array[y_lcu] = qp_delta; + } + } + + // Update new values + frame->roi.width = new_width; + frame->roi.height = new_height; + frame->roi.roi_array = new_array; + FREE_POINTER(orig_roi); +} + + +static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) { + // The ROI description is as follows: + // First number is width, second number is height, + // then follows width * height number of dqp values. + + // Rewind the (seekable) ROI file when end of file is reached. + // Allows a single ROI frame to be used for a whole sequence + // and looping with --loop-input. Skips possible whitespace. + if (ftell(file) != -1L) { + int c = fgetc(file); + while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file); + ungetc(c, file); + if (c == EOF) rewind(file); + } + + int *width = &frame->roi.width; + int *height = &frame->roi.height; + + bool failed = false; + + if (format == KVZ_ROI_TXT) failed = !fscanf(file, "%d", width) || !fscanf(file, "%d", height); + if (format == KVZ_ROI_BIN) failed = fread(&frame->roi, 4, 2, file) != 2; + + if (failed) { + fprintf(stderr, "Failed to read ROI size.\n"); + fclose(file); + assert(0); + } + + if (*width <= 0 || *height <= 0) { + fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height); + fclose(file); + assert(0); + } + + if (*width > 10000 || *height > 10000) { + fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(file); + assert(0); + } + + const unsigned size = (*width) * (*height); + int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0])); + if (!dqp_array) { + fprintf(stderr, "Failed to allocate memory for ROI table.\n"); + fclose(file); + assert(0); + } + + FREE_POINTER(frame->roi.roi_array); + frame->roi.roi_array = dqp_array; + + if (format == KVZ_ROI_TXT) { + for (int i = 0; i < size; ++i) { + int number; // Need a pointer to int for fscanf + if (fscanf(file, "%d", &number) != 1) { + fprintf(stderr, "Reading ROI file failed.\n"); + 
fclose(file); + assert(0); + } + dqp_array[i] = CLIP(-51, 51, number); + } + } else if (format == KVZ_ROI_BIN) { + if (fread(dqp_array, 1, size, file) != size) { + fprintf(stderr, "Reading ROI file failed.\n"); + assert(0); + } + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1265,6 +1423,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict state->tile->frame->height ); + // ROI / delta QP maps + if (frame->roi.roi_array && cfg->roi.file_path) { + assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified."); + } + + // Read frame from the file. If no file is specified, + // ROI data should be already set by the application. + if (cfg->roi.file_path) { + next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format); + } + + if (cfg->erp_aqp) { + init_erp_aqp_roi(state->encoder_control, state->tile->frame->source); + } + // Variance adaptive quantization if (cfg->vaq) { const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1351,6 +1524,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict } // Variance adaptive quantization - END + if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) { + state->frame->max_qp_delta_depth = 0; + } else { + state->frame->max_qp_delta_depth = -1; + } + // Use this flag to handle closed gop irap picture selection. 
// If set to true, irap is already set and we avoid // setting it based on the intra period @@ -1603,10 +1782,9 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y) int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { - const encoder_control_t *ctrl = state->encoder_control; const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); + const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); diff --git a/src/encoderstate.h b/src/encoderstate.h index a65e8b35..00885aa4 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -180,6 +180,8 @@ typedef struct encoder_state_config_frame_t { */ double *aq_offsets; + int8_t max_qp_delta_depth; + /** * \brief Whether next NAL is the first NAL in the access unit. 
*/ @@ -380,10 +382,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) */ static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) { - if (state->encoder_control->max_qp_delta_depth < 0) return false; + if (state->frame->max_qp_delta_depth < 0) return false; const int cu_width = LCU_WIDTH >> depth; - const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; const int right = x + cu_width; const int bottom = y + cu_width; return (right % qg_width == 0 || right >= state->tile->frame->width) && diff --git a/src/filter.c b/src/filter.c index d3bdfb7b..510b9ea6 100644 --- a/src/filter.c +++ b/src/filter.c @@ -274,7 +274,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir) static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (state->encoder_control->max_qp_delta_depth < 0) { + if (state->frame->max_qp_delta_depth < 0) { return state->qp; } diff --git a/src/kvazaar.h b/src/kvazaar.h index 967a3c67..73c7538d 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -250,6 +250,11 @@ enum kvz_file_format KVZ_FORMAT_YUV = 2 }; +enum kvz_roi_format +{ + KVZ_ROI_TXT = 0, + KVZ_ROI_BIN = 1 +}; // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -388,12 +393,9 @@ typedef struct kvz_config int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */ struct { - int32_t width; - int32_t height; - int8_t *dqps; - } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */ - - char *roi_file; + char *file_path; + enum kvz_roi_format format; + } roi; /*!< \brief Specify delta QPs for region of interest coding. */ unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. 
*/ @@ -764,6 +766,9 @@ typedef struct kvz_api { * the bitstream, length of the bitstream, the reconstructed frame, the * original frame and frame info in data_out, len_out, pic_out, src_out and * info_out, respectively. Otherwise, set the output parameters to NULL. + * + * Region of interest (ROI) / delta QP map can be specified in the input + * picture's ROI field but only when a ROI file is not used. * * After passing all of the input frames, the caller should keep calling this * function with pic_in set to NULL, until no more data is returned in the diff --git a/src/rate_control.c b/src/rate_control.c index e5620fb0..64983ec1 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1085,7 +1085,7 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL || state->tile->frame->source->roi.roi_array) { + if (state->tile->frame->source->roi.roi_array) { vector2d_t lcu_vec = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y @@ -1101,26 +1101,7 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); - state->lambda_sqrt = sqrt(state->frame->lambda); - } - else if (ctrl->cfg.roi.dqps != NULL) { - vector2d_t lcu = { - pos.x + state->tile->lcu_offset_x, - pos.y + state->tile->lcu_offset_y - }; - vector2d_t roi = { - lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu, - lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu - }; - int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; - int dqp = ctrl->cfg.roi.dqps[roi_index]; - if (dqp != 0) { - pos.x = 0; - } - state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); - } else if (ctrl->cfg.target_bitrate > 0) { const uint32_t pixels = MIN(LCU_WIDTH, 
state->tile->frame->width - LCU_WIDTH * pos.x) * From 85d1a54adc448b85512092e11cb690b4a3297d0b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 17 Mar 2022 14:48:08 +0200 Subject: [PATCH 43/85] Add cli option for forcing inter --- configure.ac | 2 +- src/cfg.c | 4 ++++ src/cli.c | 6 ++++++ src/kvazaar.h | 2 ++ src/search.c | 7 ++++--- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 832b584d..8171fec6 100644 --- a/configure.ac +++ b/configure.ac @@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=6 -ver_minor=6 +ver_minor=7 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/src/cfg.c b/src/cfg.c index c8a3dfa4..61a23d33 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -185,6 +185,7 @@ int kvz_config_init(kvz_config *cfg) cfg->fastrd_learning_outdir_fn = NULL; cfg->combine_intra_cus = 1; + cfg->force_inter = 0; return 1; } @@ -1426,6 +1427,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) else if OPT("combine-intra-cus") { cfg->combine_intra_cus = atobool(value); } + else if OPT("force-inter") { + cfg->force_inter = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 2212aa9b..69fffb3a 100644 --- a/src/cli.c +++ b/src/cli.c @@ -169,6 +169,8 @@ static const struct option long_options[] = { { "fastrd-outdir", required_argument, NULL, 0 }, { "combine-intra-cus", no_argument, NULL, 0 }, { "no-combine-intra-cus", no_argument, NULL, 0 }, + { "force-inter", no_argument, NULL, 0 }, + { "no-force-inter", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -586,6 +588,10 @@ void print_help(void) " be disabled if cus absolutely must not\n" " be larger than limited by the search.\n" " [enabled]" + " --force-inter : Force the encoder to use inter always.\n" + " This is mostly for debugging and is not\n" + " guaranteed to produce sensible 
bitstream or\n" + " work at all. [disabled]" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/kvazaar.h b/src/kvazaar.h index 0e6779b4..1bd59392 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -482,6 +482,8 @@ typedef struct kvz_config /** \brief whether to try combining intra cus at the lower depth when search * is not performed at said depth*/ uint8_t combine_intra_cus; + + uint8_t force_inter; } kvz_config; /** diff --git a/src/search.c b/src/search.c index d2de84cb..931555f8 100644 --- a/src/search.c +++ b/src/search.c @@ -577,12 +577,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width || - (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height; + (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && + !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I); if (can_use_intra && !skip_intra) { int8_t intra_mode; @@ -710,7 +711,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. 
cur_cu->type == CU_NOTSET || - depth < pu_depth_intra.max || + (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != KVZ_SLICE_I)) || (state->frame->slicetype != KVZ_SLICE_I && depth < pu_depth_inter.max); From d5e4e831f41cd2fb4db3973c396aace58d322c04 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 2 Dec 2021 10:05:21 +0200 Subject: [PATCH 44/85] Preliminary code for outputting bit costs during the search --- src/bitstream.c | 1 + src/cabac.c | 2 ++ src/cabac.h | 30 +++++++++++++++-------- src/encode_coding_tree.c | 4 ++++ src/encoderstate.c | 2 ++ src/global.h | 2 +- src/rdo.h | 2 -- src/sao.c | 6 ++++- src/search.c | 51 ++++++++++++++++++++++++++++------------ src/search.h | 4 ++-- src/search_inter.c | 9 +++++-- src/search_intra.c | 30 +++++++++++++++-------- 12 files changed, 100 insertions(+), 43 deletions(-) diff --git a/src/bitstream.c b/src/bitstream.c index 6a198632..f7433498 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -33,6 +33,7 @@ #include "bitstream.h" #include +#include #include #include diff --git a/src/cabac.c b/src/cabac.c index c0bbb26e..7f5b92c2 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -37,6 +37,8 @@ #include "extras/crypto.h" #include "kvazaar.h" +FILE* bit_cost_file = NULL; + const uint8_t kvz_g_auc_next_state_mps[128] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, diff --git a/src/cabac.h b/src/cabac.h index 3804fdf2..fa17c799 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -42,6 +42,8 @@ #include "bitstream.h" +extern FILE* bit_cost_file; + struct encoder_state_t; // Types @@ -126,6 +128,9 @@ void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t max_symbol); void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); +extern const float kvz_f_entropy_bits[128]; +#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] +extern double bits_written; // Macros #define 
CTX_STATE(ctx) ((ctx)->uc_state >> 1) @@ -133,24 +138,29 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } +#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) + #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ - kvz_cabac_encode_bin((data), (value)) \ - printf("%s = %u, state = %u -> %u\n", \ - (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } + uint32_t prev_state = (data)->cur_ctx->uc_state; \ + if(!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (value));\ + kvz_cabac_encode_bin((data), (value)); \ + if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } #define CABAC_BINS_EP(data, value, bins, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ + uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bins_ep((data), (value), (bins)); \ - printf("%s = %u(%u bins), state = %u -> %u\n", \ - (name), (uint32_t)(value), (bins), prev_state, (data)->ctx->uc_state); } + if(!(data)->only_count) bits_written += (bins); \ + if(!(data)->only_count) printf("%s = %u(%u bins), state = %u -> %u\n", \ + (name), (uint32_t)(value), (bins), prev_state, (data)->cur_ctx->uc_state); } #define CABAC_BIN_EP(data, value, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ + uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin_ep((data), (value)); \ - printf("%s = %u, state = %u -> %u\n", \ - (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } + if(!(data)->only_count) bits_written += 1; \ + 
if(!(data)->only_count) printf("%s = %u, state = %u -> %u\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state); } #else #define CABAC_BIN(data, value, name) \ kvz_cabac_encode_bin((data), (value)); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 03b04943..0070b718 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -923,6 +923,10 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } + if((x % 64 != 0 && y % 64 != 0) || 1) { + fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); + bits_written = 0; + } } diff --git a/src/encoderstate.c b/src/encoderstate.c index 6e3cf0b4..483dfb6a 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -1655,9 +1655,11 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s } } +double bits_written; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) { + bits_written = 0; encoder_state_init_new_frame(state, frame); encoder_state_encode(state); diff --git a/src/global.h b/src/global.h index c6a6ebba..9a2ee989 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -//#define VERBOSE 1 +#define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/rdo.h b/src/rdo.h index 3b56ddcc..dd75fdb9 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -85,7 +85,5 @@ extern const uint32_t kvz_entropy_bits[128]; #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(ctx)->uc_state ^ (val)] // Floating point fractional bits, derived from kvz_entropy_bits -extern const float kvz_f_entropy_bits[128]; -#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] #endif diff --git a/src/sao.c b/src/sao.c index 8da94345..e9fab518 100644 --- a/src/sao.c +++ b/src/sao.c @@ -508,6 +508,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (state->encoder_control->cfg.sao_type & 1){ sao_search_edge_sao(state, data, recdata, 
block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left); float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt); + FILE_BITS(mode_bits, 0, 0, 0, "sao mode bits"); int ddistortion = (int)(mode_bits * state->lambda + 0.5); unsigned buf_i; @@ -552,7 +553,9 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ // Choose between SAO and doing nothing, taking into account the // rate-distortion cost of coding do nothing. { - int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5); + float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); + int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); + FILE_BITS(mode_bits_none, 0, 0, 0, "Sao cost of nothing"); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; @@ -569,6 +572,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (merge_cand) { unsigned buf_i; float mode_bits = sao_mode_bits_merge(state, i + 1); + FILE_BITS(mode_bits, 0, 0, 0, (i == 0 ? 
"sao merge ""left" : "sao merge ""top")); int ddistortion = (int)(mode_bits * state->lambda + 0.5); switch (merge_cand->type) { diff --git a/src/search.c b/src/search.c index 7b343d2e..1fc47a06 100644 --- a/src/search.c +++ b/src/search.c @@ -248,7 +248,8 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { const int width = LCU_WIDTH >> depth; @@ -272,16 +273,17 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); + *bit_cost += tr_tree_bits; } if (tr_depth > 0) { int offset = width / 2; double sum = 0; - sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); return sum + tr_tree_bits * state->lambda; } @@ -294,6 +296,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + *bit_cost += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + } // SSD between 
reconstruction and original @@ -310,6 +314,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); + *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -320,7 +325,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; @@ -347,16 +353,17 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); } + *bit_cost += tr_tree_bits; } if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); int sum = 0; - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); return sum + tr_tree_bits * state->lambda; } @@ -380,6 +387,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, coeff_bits += 
kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order); coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order); + *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -690,9 +698,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { - cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); + double bits = 0; + cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); } double mode_bits; @@ -701,6 +710,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else { mode_bits = inter_bitcost; } + bits += mode_bits; + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + const cabac_ctx_t* ctx = &(state->cabac.ctx.split_flag_model[split_model]); + // bits += CTX_ENTROPY_FBITS(ctx, 0); + FILE_BITS(bits, x, y, depth, "final rd bits"); cost += mode_bits * state->lambda; @@ -746,14 +760,18 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "not split"); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. 
const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "not split"); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); } // If skip mode was selected for the block, skip further search. @@ -783,6 +801,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { cost = 0; + double bits = 0; cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -799,11 +818,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (has_chroma) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); } - + + FILE_BITS(bits, x, y, depth, "merged intra bits"); // Add the cost of coding no-split. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); @@ -979,6 +999,7 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { + if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index e4b299c3..2ca47c22 100644 --- a/src/search.h +++ b/src/search.h @@ -72,11 +72,11 @@ void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu); + lcu_t *const lcu, double *bits); double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu); + lcu_t *const lcu, double* bits); void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index 08594b9f..f8b88509 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1967,6 +1967,8 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); } + + FILE_BITS((double)info->inter_bitcost, x, y, depth, "regular inter bitcost"); } /** @@ -2009,11 +2011,14 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, lcu, false); - *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + double bits; + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), 
depth, cur_cu, lcu, &bits); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); } + FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); + *inter_cost += *inter_bitcost * state->lambda; } diff --git a/src/search_intra.c b/src/search_intra.c index 6d3aa141..bd259e22 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -179,7 +179,8 @@ static double search_intra_trdepth(encoder_state_t * const state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -201,6 +202,7 @@ static double search_intra_trdepth(encoder_state_t * const state, double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; + double nosplit_bits = 0; if (depth > 0) { tr_cu->tr_depth = depth; @@ -221,9 +223,9 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_mode, chroma_mode, pred_cu, lcu); - nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); if (reconstruct_chroma) { - nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); } // Early stop codition for the recursive search. 
@@ -250,15 +252,15 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 3 * state->lambda; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } double tr_split_bit = 0.0; @@ -269,6 +271,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth >= 1 && depth <= 3) { const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); + *bit_cost += tr_split_bit; } // Add cost of cbf chroma bits on transform tree. 
@@ -287,6 +290,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); } + *bit_cost += cbf_bits; } double bits = tr_split_bit + cbf_bits; @@ -608,7 +612,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); + double bit_costs = 0; + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, &bit_costs); costs[rdo_mode] += mode_cost; // Early termination if no coefficients has to be coded @@ -621,6 +626,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Update order according to new costs kvz_sort_modes(modes, costs, modes_to_check); + // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. 
@@ -632,7 +638,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.intra.mode = modes[0]; pred_cu.intra.mode_chroma = modes[0]; FILL(pred_cu.cbf, 0); - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); + double bit_cost = 0; + search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, &bit_cost); + FILE_BITS(bit_cost, x_px, y_px, depth, "tr_depth bits"); } return modes_to_check; @@ -705,9 +713,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, depth, -1, chroma.mode, // skip luma NULL, lcu); - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + double bits = 0; + chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &bits); double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); + bits += mode_bits; chroma.cost += mode_bits * state->lambda; if (chroma.cost < best_chroma.cost) { From 53264bc764c2ef2354571f83d42fda8381c7d930 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 3 Dec 2021 09:09:57 +0200 Subject: [PATCH 45/85] Update cabac context during search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a separate cabac that is only used during the search. It should hold the state that the actual cabac end after encoding said CU. Only implemented for intra so far. 
TODO: 4×4 PUs probably still have some problems --- src/cabac.h | 3 +- src/encoderstate.c | 4 +++ src/encoderstate.h | 1 + src/rdo.c | 11 ++++--- src/sao.c | 8 +++--- src/search.c | 71 ++++++++++++++++++++++++++++++---------------- src/search_intra.c | 39 ++++++++++++++++++++----- 7 files changed, 97 insertions(+), 40 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index fa17c799..7dd65a54 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -60,7 +60,8 @@ typedef struct uint32_t buffered_byte; int32_t num_buffered_bytes; int32_t bits_left; - int8_t only_count; + int8_t only_count : 4; + int8_t update : 4; bitstream_t *stream; // CONTEXTS diff --git a/src/encoderstate.c b/src/encoderstate.c index 483dfb6a..012476df 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -731,6 +731,8 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_bitstream_align_zero(state->cabac.stream); kvz_cabac_start(&state->cabac); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; kvz_crypto_delete(&state->crypto_hdl); } @@ -1214,6 +1216,8 @@ static void encoder_state_init_children(encoder_state_t * const state) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 
26 : state->frame->QP, state->frame->slicetype); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; } //Clear the jobs diff --git a/src/encoderstate.h b/src/encoderstate.h index 00885aa4..ac62a5a7 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -294,6 +294,7 @@ typedef struct encoder_state_t { bitstream_t stream; cabac_data_t cabac; + cabac_data_t search_cabac; // Crypto stuff crypto_handle_t *crypto_hdl; diff --git a/src/rdo.c b/src/rdo.c index 5403fa61..6b8960ee 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -253,12 +253,12 @@ static INLINE uint32_t get_coeff_cabac_cost( // Take a copy of the CABAC so that we don't overwrite the contexts when // counting the bits. cabac_data_t cabac_copy; - memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy)); + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); // Clear bytes and bits and set mode to "count" cabac_copy.only_count = 1; - cabac_copy.num_buffered_bytes = 0; - cabac_copy.bits_left = 23; + int num_buffered_bytes = cabac_copy.num_buffered_bytes; + int bits_left = cabac_copy.bits_left; // Execute the coding function. 
// It is safe to drop the const modifier since state won't be modified @@ -270,8 +270,11 @@ static INLINE uint32_t get_coeff_cabac_cost( type, scan_mode, 0); + if(cabac_copy.update) { - return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); + memcpy(&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + } + return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc) diff --git a/src/sao.c b/src/sao.c index e9fab518..35be7176 100644 --- a/src/sao.c +++ b/src/sao.c @@ -52,7 +52,7 @@ static void init_sao_info(sao_info_t *sao) { static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { @@ -74,7 +74,7 @@ static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t static float sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); @@ -91,7 +91,7 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { @@ -131,7 +131,7 @@ static float sao_mode_bits_band(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { diff --git a/src/search.c b/src/search.c index 1fc47a06..2cb34608 100644 --- a/src/search.c +++ b/src/search.c @@ -245,7 +245,7 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -double kvz_cu_rd_cost_luma(const encoder_state_t *const state, +double kvz_cu_rd_cost_luma(encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, lcu_t *const lcu, @@ -271,8 +271,12 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, && width > TR_MIN_WIDTH && !intra_split_flag) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, tr_depth > 0, "tr_split_search"); + } *bit_cost += tr_tree_bits; } @@ -294,9 +298,14 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); - *bit_cost += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_luma[!tr_depth]); + int is_set = 
cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); + } + *bit_cost += CTX_ENTROPY_FBITS(ctx, is_set); } @@ -346,7 +355,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (depth < MAX_PU_DEPTH) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); } @@ -494,6 +503,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double inter_zero_coeff_cost = MAX_DOUBLE; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; + cabac_data_t pre_search_cabac; + memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); struct { int32_t min; @@ -699,24 +710,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; - cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - } + state->search_cabac.update = 1; + + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + // TODO: intra 4x4 PUs use different method + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); double mode_bits; if (cur_cu->type == CU_INTRA) { mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); - } else { + } + else { mode_bits = 
inter_bitcost; } bits += mode_bits; - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t* ctx = &(state->cabac.ctx.split_flag_model[split_model]); - // bits += CTX_ENTROPY_FBITS(ctx, 0); - FILE_BITS(bits, x, y, depth, "final rd bits"); + cost = mode_bits * state->lambda; - cost += mode_bits * state->lambda; + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); + } + + FILE_BITS(bits, x, y, depth, "final rd bits"); if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -739,7 +757,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - } + state->search_cabac.update = 0; + } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even @@ -754,24 +773,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int half_cu = cu_width / 2; double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf, depth); + cabac_data_t post_seach_cabac; + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "not split"); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); - cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "not split"); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN + state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); } // If skip mode was selected for the block, skip further search. @@ -826,7 +848,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, FILE_BITS(bits, x, y, depth, "merged intra bits"); // Add the cost of coding no-split. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // Add the cost of coding intra mode only once. @@ -845,6 +867,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth > 0) { // Copy this CU's mode all the way down for use in adjacent CUs mode // search. 
+ memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); work_tree_copy_down(x_local, y_local, depth, work_tree); } } else if (depth >= 0 && depth < MAX_PU_DEPTH) { diff --git a/src/search_intra.c b/src/search_intra.c index bd259e22..ccf1ca91 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -98,11 +98,11 @@ static double get_cost(encoder_state_t * const state, // Add the offset bit costs of signaling 'luma and chroma use trskip', // versus signaling 'luma and chroma don't use trskip' to the SAD cost. - const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma; + const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma; double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); if (state->encoder_control->chroma_format != KVZ_CSP_400) { - ctx = &state->cabac.ctx.transform_skip_model_chroma; + ctx = &state->search_cabac.ctx.transform_skip_model_chroma; trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); } @@ -269,7 +269,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // Add bits for split_transform_flag = 1, because transform depth search bypasses // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); *bit_cost += tr_split_bit; } @@ -283,7 +283,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (state->encoder_control->chroma_format != KVZ_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); } @@ -647,8 +647,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } -double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) +double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { + cabac_data_t* cabac = &state->search_cabac; double mode_bits; bool mode_in_preds = false; @@ -658,8 +659,23 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const } } - const cabac_ctx_t *ctx = &(state->cabac.ctx.intra_mode_model); + const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, mode_in_preds, "prev_intra_luma_pred_flag_search"); + if(mode_in_preds) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); + if(luma_mode != intra_preds[0]) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); + } + } + else { + // This value should be transformed for actual coding, + // but here the value does not actually matter, just that we write 5 bits + CABAC_BINS_EP(cabac, luma_mode, 5, 
"rem_intra_luma_pred_mode"); + } + } if (mode_in_preds) { mode_bits += ((luma_mode == intra_preds[0]) ? 1 : 2); @@ -673,13 +689,22 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model[0]); + cabac_data_t* cabac = &state->search_cabac; + const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); double mode_bits; if (chroma_mode == luma_mode) { mode_bits = CTX_ENTROPY_FBITS(ctx, 0); } else { mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1); } + if(cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, chroma_mode != luma_mode, "intra_chroma_pred_mode"); + if(chroma_mode != luma_mode) { + // Again it does not matter what we actually write here + CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); + } + } return mode_bits; } From 9ed8d0a7d9e6570763ca31e4a928ecb7fec8b3b9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 7 Dec 2021 08:13:08 +0200 Subject: [PATCH 46/85] count all non-tr-depth related bits correctly --- src/cabac.c | 1 + src/search.c | 74 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 7f5b92c2..5842edbe 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -97,6 +97,7 @@ void kvz_cabac_start(cabac_data_t * const data) data->num_buffered_bytes = 0; data->buffered_byte = 0xff; data->only_count = 0; // By default, write bits out + data->update = 0; } /** diff --git a/src/search.c b/src/search.c index 2cb34608..a0534bf4 100644 --- a/src/search.c +++ b/src/search.c @@ -265,17 +265,27 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, const uint8_t tr_depth = tr_cu->tr_depth - depth; + cabac_data_t* cabac = &state->search_cabac; + // Add transform_tree split_transform_flag bit cost. 
bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + int max_tr_depth; + if (tr_cu->type == CU_INTRA) { + max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; + } + else { + max_tr_depth = state->encoder_control->tr_depth_inter; + } if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH - && !intra_split_flag) + && !intra_split_flag + && tr_depth < max_tr_depth) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); - if (state->search_cabac.update) { - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, tr_depth > 0, "tr_split_search"); + if (cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, tr_depth > 0, "tr_split_search"); } *bit_cost += tr_tree_bits; } @@ -298,14 +308,28 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_luma[!tr_depth]); + const cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); - if (state->search_cabac.update) { + if (cabac->update) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac state changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. 
+ if (state->encoder_control->chroma_format != KVZ_CSP_400) { + const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, u_is_set); + CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); + tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); + CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); + } + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + *bit_cost += tr_tree_bits; state->search_cabac.cur_ctx = ctx; CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); } - *bit_cost += CTX_ENTROPY_FBITS(ctx, is_set); } @@ -353,7 +377,8 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return 0; } - if (depth < MAX_PU_DEPTH) { + // See luma for why the second condition + if (depth < MAX_PU_DEPTH && !state->search_cabac.update) { const int tr_depth = depth - pred_cu->depth; const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { @@ -712,12 +737,21 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double bits = 0; state->search_cabac.update = 1; - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - // TODO: intra 4x4 PUs use different method - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + if(depth < MAX_DEPTH) { + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + } + else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { + // 
Add cost of intra part_size. + const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN + state->search_cabac.cur_ctx = ctx; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); + } double mode_bits; if (cur_cu->type == CU_INTRA) { @@ -776,6 +810,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_data_t post_seach_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + state->search_cabac.update = 1; if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. @@ -792,9 +827,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 0, "split_search"); } + state->search_cabac.update = 0; // If skip mode was selected for the block, skip further search. 
// Skip mode means there's no coefficients in the block, so splitting @@ -1023,6 +1059,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); From d2299adb1c301d1a3d723cf7832a81bfb0e6bc9e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 7 Dec 2021 09:11:47 +0200 Subject: [PATCH 47/85] Disable bit debug code when VERBOSE is not defined and count bits when combining the intra cus --- src/cabac.h | 6 +++++- src/encode_coding_tree.c | 3 +++ src/global.h | 2 +- src/search.c | 29 +++++++++++++++++++++++++++-- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index 7dd65a54..59fb448c 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -139,7 +139,11 @@ extern double bits_written; #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } -#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) +#ifdef VERBOSE +#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) +#else +#define FILE_BITS(bits, x, y, depth, name) {} +#endif #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 0070b718..aa083f5b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -923,10 +923,13 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } +#ifdef VERBOSE if((x % 64 != 0 && y % 
64 != 0) || 1) { fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); bits_written = 0; } +#endif + } diff --git a/src/global.h b/src/global.h index 9a2ee989..2ad0830b 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -#define VERBOSE 1 +// #define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/search.c b/src/search.c index a0534bf4..ce521e23 100644 --- a/src/search.c +++ b/src/search.c @@ -854,12 +854,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, && x + cu_width <= frame->width && y + cu_width <= frame->height && state->encoder_control->cfg.combine_intra_cus) { + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + if (depth < MAX_DEPTH) { + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + } + else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { + // Add cost of intra part_size. 
+ const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN + state->search_cabac.cur_ctx = ctx; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); + } cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -876,6 +895,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + cost += mode_bits * state->lambda; + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (has_chroma) { cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); @@ -888,8 +910,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // Add the cost of coding intra mode only once. - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); - cost += mode_bits * state->lambda; + + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } @@ -1058,7 +1081,9 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { +#ifdef VERBOSE if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); +#endif memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); From dacc15f33be504cf4a28da6cb23a3517e4cbb70f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 8 Dec 2021 10:27:07 +0200 Subject: [PATCH 48/85] Count pred mode bit --- src/search.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/search.c b/src/search.c index ce521e23..bf37640f 100644 --- a/src/search.c 
+++ b/src/search.c @@ -628,6 +628,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } + cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0) * state->lambda; } } @@ -654,6 +655,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double intra_cost; kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); + if(state->frame->slicetype != KVZ_SLICE_I) { + intra_cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 1) * state->lambda; + } if (intra_cost < cost) { cost = intra_cost; cur_cu->type = CU_INTRA; @@ -755,6 +759,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits; if (cur_cu->type == CU_INTRA) { + cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); + bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, 1, "pred_mode"); mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } else { From de3a76d8747fc9ddb35375ae5d25b8dee51ff27a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 8 Dec 2021 11:48:46 +0200 Subject: [PATCH 49/85] Correctly calculate bits for transform split, however updating is done incorrectly, but cannot be fixed easily unfortunately --- src/search.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/search.c b/src/search.c index bf37640f..c1947957 100644 --- a/src/search.c +++ b/src/search.c @@ -245,7 +245,7 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. 
*/ -double kvz_cu_rd_cost_luma(encoder_state_t *const state, +double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, lcu_t *const lcu, @@ -265,7 +265,7 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, const uint8_t tr_depth = tr_cu->tr_depth - depth; - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; @@ -308,9 +308,9 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update) { + if (cabac->update && tr_cu->tr_depth == 0) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. 
However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in @@ -325,10 +325,12 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); } - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); - *bit_cost += tr_tree_bits; - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); + } + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + *bit_cost += tr_tree_bits; + if(cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, is_set, "luma_cbf_search"); } } @@ -378,14 +380,20 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && !state->search_cabac.update) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth)) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); + cabac->cur_ctx = ctx; if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, u_is_set); + if(state->search_cabac.update) CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, v_is_set); + if (state->search_cabac.update) CABAC_BIN(cabac, v_is_set, "cbf_cb_search"); } *bit_cost += tr_tree_bits; } From 1fb69d5e2271d750dc8b68a02bffaab8fd300fb5 Mon Sep 17 
00:00:00 2001 From: Joose Sainio Date: Thu, 9 Dec 2021 13:19:42 +0200 Subject: [PATCH 50/85] Correct tr depth bit calculation --- src/search.c | 22 +++++++++++++--------- src/search_intra.c | 2 +- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index c1947957..cd4c67b7 100644 --- a/src/search.c +++ b/src/search.c @@ -270,7 +270,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; int max_tr_depth; - if (tr_cu->type == CU_INTRA) { + if (pred_cu->type == CU_INTRA) { max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; } else { @@ -279,9 +279,9 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && tr_depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) { - const cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); + cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); if (cabac->update) { cabac->cur_ctx = ctx; @@ -310,11 +310,13 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update && tr_cu->tr_depth == 0) { + if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in // luma, because the chroma cost is calculated right after the luma cost. 
+ // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. if (state->encoder_control->chroma_format != KVZ_CSP_400) { const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; @@ -380,7 +382,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth)) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth)) { const int tr_depth = depth - pred_cu->depth; cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); @@ -767,10 +769,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits; if (cur_cu->type == CU_INTRA) { - cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + if(state->frame->slicetype != KVZ_SLICE_I) { + cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); + bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + } mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } else { diff --git a/src/search_intra.c b/src/search_intra.c index ccf1ca91..ac72bd44 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -250,7 +250,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). 
if (depth < max_depth && depth < MAX_PU_DEPTH) { - split_cost = 3 * state->lambda; + split_cost = 0; split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); if (split_cost < nosplit_cost) { From 311fceade7f8c94009f8b7b68b9fe6da729862ab Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 Dec 2021 08:30:06 +0200 Subject: [PATCH 51/85] Force use inter --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index f8b88509..57e163f4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1968,7 +1968,6 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[2]); } - FILE_BITS((double)info->inter_bitcost, x, y, depth, "regular inter bitcost"); } /** @@ -2122,6 +2121,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } + FILE_BITS((double)*inter_bitcost, x, y, depth, "regular inter bitcost"); } From d8648fe1de496e49cf92baf2d5f1ab70425fb3cd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 13 Dec 2021 10:43:19 +0200 Subject: [PATCH 52/85] Preparation for accurately counting inter bits --- src/encode_coding_tree.c | 20 +++++-- src/encode_coding_tree.h | 6 +++ src/fast_coeff_cost.c | 6 +-- src/fast_coeff_cost.h | 2 +- src/global.h | 2 +- src/inter.c | 4 +- src/inter.h | 2 +- src/search.c | 14 ++--- src/search.h | 2 +- src/search_inter.c | 112 +++++++++++++++++++-------------------- src/search_inter.h | 6 +-- 11 files changed, 96 insertions(+), 80 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index aa083f5b..76f0cc7e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -308,11 +308,11 @@ static void encode_transform_coeff(encoder_state_t * const state, } } -static void 
encode_inter_prediction_unit(encoder_state_t * const state, +void kvz_encode_inter_prediction_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int x, int y, int width, int height, - int depth) + int depth, lcu_t* lcu) { // Mergeflag int16_t num_cand = 0; @@ -385,10 +385,20 @@ static void encode_inter_prediction_unit(encoder_state_t * const state, if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { int16_t mv_cand[2][2]; - kvz_inter_get_mv_cand_cua( + if (lcu) { + kvz_inter_get_mv_cand( + state, + x, y, width, height, + mv_cand, cur_cu, + lcu, ref_list_idx); + } + else { + kvz_inter_get_mv_cand_cua( state, x, y, width, height, - mv_cand, cur_cu, ref_list_idx); + mv_cand, cur_cu, ref_list_idx + ); + } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; @@ -855,7 +865,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth); + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL); } { diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 4832eeb1..a3f95b36 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -51,6 +51,12 @@ void kvz_encode_mvd(encoder_state_t * const state, int32_t mvd_hor, int32_t mvd_ver); +void kvz_encode_inter_prediction_unit(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu); + void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c index 
d769791d..1abb5114 100644 --- a/src/fast_coeff_cost.c +++ b/src/fast_coeff_cost.c @@ -40,7 +40,7 @@ static uint16_t to_q88(float f) return (uint16_t)(f * 256.0f + 0.5f); } -static uint64_t to_4xq88(const float f[4]) +static uint64_t to_4xq88(const double f[4]) { int i; uint64_t result = 0; @@ -58,9 +58,9 @@ int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_ uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp; for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) { - float curr_wts[4]; + double curr_wts[4]; - if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0, + if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0, curr_wts + 1, curr_wts + 2, curr_wts + 3) != 4) { diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h index 5ae6dc25..dee647f7 100644 --- a/src/fast_coeff_cost.h +++ b/src/fast_coeff_cost.h @@ -45,7 +45,7 @@ typedef struct { // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from // 0 to MAX_FAST_COEFF_COST_QP -static const float default_fast_coeff_cost_wts[][4] = { +static const double default_fast_coeff_cost_wts[][4] = { // Just extend it by stretching the first actual values.. 
{0.164240, 4.161530, 3.509033, 6.928047}, {0.164240, 4.161530, 3.509033, 6.928047}, diff --git a/src/global.h b/src/global.h index 2ad0830b..9a2ee989 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -// #define VERBOSE 1 +#define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/inter.c b/src/inter.c index 02ea1a95..d6b83090 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1228,7 +1228,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, int32_t width, int32_t height, const merge_candidates_t *merge_cand, - const cu_info_t *cur_cu, + const cu_info_t * const cur_cu, int8_t reflist, int16_t mv_cand[2][2]) { @@ -1335,7 +1335,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, int16_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t * const cur_cu, lcu_t *lcu, int8_t reflist) { diff --git a/src/inter.h b/src/inter.h index 1a46e98a..7b5c4ea7 100644 --- a/src/inter.h +++ b/src/inter.h @@ -88,7 +88,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, int16_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist); diff --git a/src/search.c b/src/search.c index cd4c67b7..553c4380 100644 --- a/src/search.c +++ b/src/search.c @@ -318,7 +318,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // However, if we have different tr_depth, the bits cannot be written in correct // order anyways so do not touch the chroma cbf here. 
if (state->encoder_control->chroma_format != KVZ_CSP_400) { - const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); @@ -536,7 +536,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int cu_width = LCU_WIDTH >> depth; double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; - uint32_t inter_bitcost = MAX_INT; + double inter_bitcost = MAX_INT; cu_info_t *cur_cu; cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); @@ -600,7 +600,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (can_use_inter) { double mode_cost; - uint32_t mode_bitcost; + double mode_bitcost; kvz_search_cu_inter(state, x, y, depth, @@ -760,7 +760,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. - const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); @@ -835,7 +835,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); @@ -844,7 +844,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); @@ -893,7 +893,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. 
- const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); diff --git a/src/search.h b/src/search.h index 2ca47c22..b11a0ad5 100644 --- a/src/search.h +++ b/src/search.h @@ -59,7 +59,7 @@ typedef struct unit_stats_map_t { cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs - uint32_t bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + double bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; diff --git a/src/search_inter.c b/src/search_inter.c index 57e163f4..983ffcc8 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -199,15 +199,15 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int * \return true if best_mv was changed, false otherwise */ static bool check_mv_cost(inter_search_info_t *info, - int x, - int y, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; - uint32_t bitcost = 0; + double bitcost = 0; double cost = kvz_image_calc_sad( info->pic, info->ref, @@ -292,10 +292,10 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * best_mv to the best one. 
*/ static void select_starting_point(inter_search_info_t *info, - vector2d_t extra_mv, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); @@ -394,9 +394,9 @@ static double calc_mvd_cost(const encoder_state_t *state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost) + double* bitcost) { - uint32_t temp_bitcost = 0; + double temp_bitcost = 0; uint32_t merge_idx; int8_t merged = 0; @@ -429,9 +429,9 @@ static double calc_mvd_cost(const encoder_state_t *state, static bool early_terminate(inter_search_info_t *info, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, @@ -485,7 +485,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, vector2d_t mv, int *best_dist, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { assert(pattern_type < 4); @@ -603,7 +603,7 @@ void kvz_tz_raster_search(inter_search_info_t *info, int iSearchRange, int iRaster, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { const vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; @@ -618,10 +618,10 @@ void kvz_tz_raster_search(inter_search_info_t *info, static void tz_search(inter_search_info_t *info, - vector2d_t extra_mv, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -705,11 +705,11 @@ static void tz_search(inter_search_info_t *info, * points like 0,0 might be used, such as vectors 
from top or left. */ static void hexagon_search(inter_search_info_t *info, - vector2d_t extra_mv, - uint32_t steps, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -803,11 +803,11 @@ static void hexagon_search(inter_search_info_t *info, * points like 0,0 might be used, such as vectors from top or left. **/ static void diamond_search(inter_search_info_t *info, - vector2d_t extra_mv, - uint32_t steps, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -888,7 +888,7 @@ static void search_mv_full(inter_search_info_t *info, int32_t search_range, vector2d_t extra_mv, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { // Search around the 0-vector. @@ -968,7 +968,7 @@ static void search_mv_full(inter_search_info_t *info, */ static void search_frac(inter_search_info_t *info, double *best_cost, - uint32_t *best_bits, + double *best_bits, vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: @@ -985,8 +985,8 @@ static void search_frac(inter_search_info_t *info, vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; double cost = MAX_DOUBLE; - uint32_t bitcost = 0; - uint32_t bitcosts[4] = { 0 }; + double bitcost = 0; + double bitcosts[4] = { 0 }; unsigned best_index = 0; // Keep this as unsigned until SAD / SATD functions are updated @@ -1314,7 +1314,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, } double best_cost = MAX_DOUBLE; - uint32_t best_bits = MAX_INT; + double best_bits = MAX_INT; // Select starting point from among merge candidates. 
These should // include both mv_cand vectors and (0, 0). @@ -1338,12 +1338,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, case KVZ_IME_DIA: diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); + &best_cost, &best_bits, &best_mv); break; default: hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); + &best_cost, &best_bits, &best_mv); break; } } @@ -1484,7 +1484,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; cost += info->mvd_cost_func(info->state, merge_cand[i].mv[0][0], @@ -1827,7 +1827,7 @@ static void search_pu_inter(encoder_state_t * const state, list); double frac_cost = MAX_DOUBLE; - uint32_t frac_bits = MAX_INT; + double frac_bits = MAX_INT; vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; search_frac(info, &frac_cost, &frac_bits, &frac_mv); @@ -1917,7 +1917,7 @@ static void search_pu_inter(encoder_state_t * const state, best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; best_bipred_cost += info->mvd_cost_func(info->state, bipred_pu->inter.mv[0][0], @@ -1990,10 +1990,10 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_bitcost Return inter bitcost */ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost){ + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost){ cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); int tr_depth = MAX(1, depth); @@ -2040,7 +2040,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, 
lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2108,10 +2108,10 @@ void kvz_search_cu_inter(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + x, y, depth, + lcu, + inter_cost, + inter_bitcost); } if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { @@ -2146,7 +2146,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2173,7 +2173,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, const int height_pu = PU_GET_H(part_mode, width, i); double cost = MAX_DOUBLE; - uint32_t bitcost = MAX_INT; + double bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); @@ -2250,10 +2250,10 @@ void kvz_search_cu_smp(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + x, y, depth, + lcu, + inter_cost, + inter_bitcost); } else { *inter_cost += state->lambda_sqrt * smp_extra_bits; } diff --git a/src/search_inter.h b/src/search_inter.h index 8b4b16f2..bb9067c5 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -71,13 +71,13 @@ typedef double kvz_mvd_cost_func(const encoder_state_t *state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost); + double *bitcost); void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); void kvz_search_cu_smp(encoder_state_t * const state, int x, int y, @@ -85,7 +85,7 @@ 
void kvz_search_cu_smp(encoder_state_t * const state, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); unsigned kvz_inter_satd_cost(const encoder_state_t* state, From 4b8d217f2dcc7dfba3e3abd17ca6c95013437d5b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 13 Dec 2021 12:23:16 +0200 Subject: [PATCH 53/85] Add new macro for potentially updating cabac context when obtaining the bit cost --- src/cabac.h | 8 +++++ src/sao.c | 50 +++++++++++++++-------------- src/search.c | 78 ++++++++++++++-------------------------------- src/search_intra.c | 27 +++++++--------- 4 files changed, 70 insertions(+), 93 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index 59fb448c..8f0c7c70 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -131,6 +131,14 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol extern const float kvz_f_entropy_bits[128]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] + +#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ + (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ + if((cabac)->update) {\ + (cabac)->cur_ctx = ctx;\ + CABAC_BIN((cabac), (val), (name));\ + } \ +} while(0) extern double bits_written; // Macros diff --git a/src/sao.c b/src/sao.c index 35be7176..179f4311 100644 --- a/src/sao.c +++ b/src/sao.c @@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) { } -static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) +static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, none = 0 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type"); return mode_bits; } -static float sao_mode_bits_merge(const encoder_state_t * const state, +static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag"); if (merge_cand == 1) return mode_bits; - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag"); return mode_bits; } -static float sao_mode_bits_edge(const encoder_state_t * const state, +static double sao_mode_bits_edge(const encoder_state_t * const state, int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { - ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + ctx = &(cabac->ctx.sao_merge_flag_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, edge = 2 = cMax ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets. for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) { @@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, } -static float sao_mode_bits_band(const encoder_state_t * const state, +static double sao_mode_bits_band(const encoder_state_t * const state, int band_position[2], int offsets[10], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded sao_type_idx_, band = 1 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets and possible FL coded offset signs. 
for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) diff --git a/src/search.c b/src/search.c index 553c4380..43a07d4b 100644 --- a/src/search.c +++ b/src/search.c @@ -282,11 +282,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); - if (cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, tr_depth > 0, "tr_split_search"); - } + CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); *bit_cost += tr_tree_bits; } @@ -318,23 +314,16 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // However, if we have different tr_depth, the bits cannot be written in correct // order anyways so do not touch the chroma cbf here. if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, u_is_set); - CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); - tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); - CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } } - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); *bit_cost += tr_tree_bits; - if(cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, is_set, "luma_cbf_search"); - } - } // SSD between reconstruction and original @@ -389,13 +378,11 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, cabac->cur_ctx = ctx; 
if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, u_is_set); - if(state->search_cabac.update) CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, v_is_set); - if (state->search_cabac.update) CABAC_BIN(cabac, v_is_set, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } *bit_cost += tr_tree_bits; } @@ -638,7 +625,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } - cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0) * state->lambda; + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 0, pred_mode_type_bits, "pred_mode_flag"); + cost += pred_mode_type_bits * state->lambda; } } @@ -666,7 +655,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); if(state->frame->slicetype != KVZ_SLICE_I) { - intra_cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 1) * state->lambda; + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + intra_cost += pred_mode_type_bits * state->lambda; } if (intra_cost < cost) { cost = intra_cost; @@ -754,26 +745,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if(depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t* ctx = 
&(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } double mode_bits; if (cur_cu->type == CU_INTRA) { if(state->frame->slicetype != KVZ_SLICE_I) { cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, bits, "pred_mode_flag"); } mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } @@ -832,25 +816,22 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); state->search_cabac.update = 1; + double split_bits = 0; + if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. 
cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); - split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 0, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting @@ -887,17 +868,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. 
cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search"); } cur_cu->intra = cu_d1->intra; @@ -915,7 +891,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y) + bits; cost += mode_bits * state->lambda; cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); @@ -924,12 +900,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } FILE_BITS(bits, x, y, depth, "merged intra bits"); - // Add the cost of coding no-split. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - - // Add the cost of coding intra mode only once. memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); diff --git a/src/search_intra.c b/src/search_intra.c index ac72bd44..e29f29a3 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -270,7 +270,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); - tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); *bit_cost += tr_split_bit; } @@ -285,10 +285,10 @@ static double search_intra_trdepth(encoder_state_t * const state, const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } *bit_cost += cbf_bits; } @@ -650,7 +650,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { cabac_data_t* cabac = &state->search_cabac; - double mode_bits; + double mode_bits = 0; bool mode_in_preds = false; for (int i = 0; i < 3; ++i) { @@ -660,10 +660,8 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t } const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); - mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds); + CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); if (state->search_cabac.update) { - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, mode_in_preds, "prev_intra_luma_pred_flag_search"); if(mode_in_preds) { CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); if(luma_mode != intra_preds[0]) { @@ -689,17 +687,16 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const 
int8_t double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); - double mode_bits; - if (chroma_mode == luma_mode) { - mode_bits = CTX_ENTROPY_FBITS(ctx, 0); - } else { - mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1); + + double mode_bits = 0; + CABAC_FBITS_UPDATE(cabac, ctx, chroma_mode != luma_mode, mode_bits, "intra_chroma_pred_mode"); + if (chroma_mode != luma_mode) { + mode_bits += 2.0; } + if(cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, chroma_mode != luma_mode, "intra_chroma_pred_mode"); if(chroma_mode != luma_mode) { // Again it does not matter what we actually write here CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); From aea1133e6a48715be04d738e500aaccd6d0b871e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 08:40:23 +0200 Subject: [PATCH 54/85] Function for mock coding a CU and counting the bits --- src/cabac.c | 3 +- src/cabac.h | 4 +- src/encode_coding_tree.c | 228 +++++++++++++++++++++++++++++++-------- src/encode_coding_tree.h | 19 +++- src/rdo.c | 20 ++-- src/rdo.h | 8 +- src/search.c | 56 +++++++--- src/search_inter.c | 8 +- 8 files changed, 261 insertions(+), 85 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 5842edbe..ed480e17 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -547,7 +547,7 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int /** * \brief */ -void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, +uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count) @@ -576,4 +576,5 @@ void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, } } kvz_cabac_encode_bins_ep(data, bins, num_bins); + return num_bins; } diff --git a/src/cabac.h b/src/cabac.h index 
8f0c7c70..62d59d9e 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -122,7 +122,7 @@ void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param); void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol, const uint32_t r_param, int32_t base_level); -void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, +uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t symbol, int32_t offset, @@ -133,7 +133,7 @@ extern const float kvz_f_entropy_bits[128]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] #define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ - (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ + if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ if((cabac)->update) {\ (cabac)->cur_ctx = ctx;\ CABAC_BIN((cabac), (val), (name));\ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 76f0cc7e..a847640e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -309,15 +309,17 @@ static void encode_transform_coeff(encoder_state_t * const state, } void kvz_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu) + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu, double* bits_out) { // Mergeflag int16_t num_cand = 0; - cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); + double bits = 0; + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag"); + num_cand = 
state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { @@ -325,10 +327,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + if(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -339,12 +341,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, uint8_t inter_dir = cur_cu->inter.mv_dir-1; if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[depth]); - CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[depth]), inter_dir == 2, bits, "inter_pred_idc"); } if (inter_dir < 2) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); - CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[4]), inter_dir, bits, "inter_pred_idc"); } } @@ -359,9 +359,8 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, if (ref_LX_size > 1) { // parseRefFrmIdx int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { ref_frame--; @@ -373,9 +372,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, if (i == 0) { cabac->cur_ctx = &cabac->ctx.cu_ref_pic_model[1]; - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_ref_pic_model[1], symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + if 
(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -404,7 +404,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver); + kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out); } // Signal which candidate MV to use @@ -416,6 +416,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, } // for ref_list } // if !merge + if(bits_out) *bits_out += bits; } @@ -466,7 +467,7 @@ static INLINE uint8_t intra_mode_encryption(encoder_state_t * const state, static void encode_intra_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth) + int x, int y, int depth, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual[4]; @@ -569,18 +570,19 @@ static void encode_intra_coding_unit(encoder_state_t * const state, } #endif } - - cabac->cur_ctx = &(cabac->ctx.intra_mode_model); + for (int j = 0; j < num_pred_units; ++j) { - CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_mode_model),flag[j], *bits_out, "prev_intra_luma_pred_flag"); } for (int j = 0; j < num_pred_units; ++j) { // Signal index of the prediction mode in the prediction list. if (flag[j]) { CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx"); + if (cabac->only_count) *bits_out += 1; if (mpm_preds[j] != 0) { CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx"); + if (cabac->only_count) *bits_out += 1; } } else { // Signal the actual prediction mode. 
@@ -599,6 +601,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state, } CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode"); + if (cabac->only_count) *bits_out += 5; } } @@ -639,17 +642,21 @@ static void encode_intra_coding_unit(encoder_state_t * const state, */ cabac->cur_ctx = &(cabac->ctx.chroma_pred_model[0]); if (pred_mode == 4) { - CABAC_BIN(cabac, 0, "intra_chroma_pred_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.chroma_pred_model[0]), 0, *bits_out,"intra_chroma_pred_mode"); } else { - CABAC_BIN(cabac, 1, "intra_chroma_pred_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.chroma_pred_model[0]), 1, *bits_out,"intra_chroma_pred_mode"); CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode"); + if (cabac->only_count) *bits_out += 2; } } - encode_transform_coeff(state, x, y, depth, 0, 0, 0); + // if we are counting bits, the cost for transform coeffs is done separately + // To get the distortion at the same time + if(!cabac->only_count) + encode_transform_coeff(state, x, y, depth, 0, 0, 0); } -static void encode_part_mode(encoder_state_t * const state, +static double encode_part_mode(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int depth) @@ -684,32 +691,32 @@ static void encode_part_mode(encoder_state_t * const state, // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass // ------------------------------+------------------ - + double bits = 0; if (cur_cu->type == CU_INTRA) { if (depth == MAX_DEPTH) { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); } else { - CABAC_BIN(cabac, 0, "part_mode NxN"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN"); } } } else { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == 
SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode split"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split"); cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_2NxnD) { - CABAC_BIN(cabac, 1, "part_mode vertical"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical"); } else { - CABAC_BIN(cabac, 0, "part_mode horizontal"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal"); } if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) { @@ -717,19 +724,22 @@ static void encode_part_mode(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_Nx2N) { - CABAC_BIN(cabac, 1, "part_mode SMP"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode AMP"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP"); if (cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_nLx2N) { CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } else { CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } } } + return bits; } void kvz_encode_coding_tree(encoder_state_t * const state, @@ -865,7 +875,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL); + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, 
NULL); } { @@ -883,7 +893,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); } #if ENABLE_PCM @@ -942,11 +952,135 @@ end: } +void kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu) { + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + + const int cu_width = LCU_WIDTH >> depth; + const int half_cu = cu_width >> 1; + + const cu_info_t* cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const cu_info_t* left_cu = NULL, *above_cu = NULL; + if (x) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); + } + uint8_t split_model = 0; + + // Absolute coordinates + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y; + + // Check for slice border + bool border_x = ctrl->in.width < abs_x + cu_width; + bool border_y = ctrl->in.height < abs_y + cu_width; + bool border = border_x || border_y; /*!< are we in any border CU */ + + if (depth <= state->frame->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + + // When not in MAX_DEPTH, insert split flag and split the blocks if needed + if (depth != MAX_DEPTH) { + // Implicit split flag when on border + if (!border) { + // Get left and top block split_flags and if they are present and true, increase model number + if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + split_model++; + } + + if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + split_model++; + } + + // This mocks encoding the current CU so it should be never split + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag"); + } + } + + // Encode skip flag + 
if (state->frame->slicetype != KVZ_SLICE_I) { + int8_t ctx_skip = 0; + + if (left_cu && left_cu->skipped) { + ctx_skip++; + } + if (above_cu && above_cu->skipped) { + ctx_skip++; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag"); + + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + if(cabac->only_count) bits += 1; + } + if (symbol == 0) { + break; + } + } + } + return; + } + } + // Prediction mode + if (state->frame->slicetype != KVZ_SLICE_I) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model), (cur_cu->type == CU_INTRA), bits, "PredMode"); + } + + // part_mode + bits += encode_part_mode(state, cabac, cur_cu, depth); + + if (cur_cu->type == CU_INTER) { + const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; + + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); + const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); + const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); + const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); + const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); + } + + { + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + // Only need to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, bits, "rqt_root_cbf"); + } + + } + } + else if (cur_cu->type == CU_INTRA) { + 
encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + } +} + void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver) + int32_t mvd_ver, double* bits_out) { const int8_t hor_abs_gr0 = mvd_hor != 0; const int8_t ver_abs_gr0 = mvd_ver != 0; @@ -954,20 +1088,21 @@ void kvz_encode_mvd(encoder_state_t * const state, const uint32_t mvd_ver_abs = abs(mvd_ver); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver"); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor"); } if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver"); } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + if(cabac->only_count) *bits_out += bits; } uint32_t mvd_hor_sign = (mvd_hor > 0) ? 
0 : 1; if (!state->cabac.only_count && @@ -976,10 +1111,12 @@ void kvz_encode_mvd(encoder_state_t * const state, mvd_hor_sign = mvd_hor_sign ^ kvz_crypto_get_key(state->crypto_hdl, 1); } CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + if (cabac->only_count) *bits_out += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + if (cabac->only_count) *bits_out += bits; } uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1; if (!state->cabac.only_count && @@ -988,5 +1125,6 @@ void kvz_encode_mvd(encoder_state_t * const state, mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); } CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + if (cabac->only_count) *bits_out += 1; } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index a3f95b36..b8e29358 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -49,13 +49,22 @@ void kvz_encode_coding_tree(encoder_state_t * const state, void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver); + int32_t mvd_ver, + double* bits_out); + +void kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu); void kvz_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu); + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, + lcu_t* lcu, + double* bits_out); void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, diff --git a/src/rdo.c b/src/rdo.c index 6b8960ee..5b6c3b49 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1010,22 +1010,18 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, 
coeff_t *dest_coeff, /** * Calculate cost of actual motion vectors using CABAC coding */ -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { cabac_data_t cabac_copy = *cabac; cabac_copy.only_count = 1; - + double bits = 0; // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits); - uint32_t bitcost = - ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - - ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); - - return bitcost; + return bits; } /** MVD cost calculation with CABAC @@ -1160,7 +1156,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, NULL); } // Signal which candidate MV to use diff --git a/src/rdo.h b/src/rdo.h index dd75fdb9..23453eee 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -71,10 +71,10 @@ uint32_t kvz_get_coded_level(encoder_state_t * state, double* coded_cost, double kvz_mvd_cost_func kvz_calc_mvd_cost_cabac; -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - int32_t mvd_hor, - int32_t mvd_ver); +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + int32_t mvd_hor, + int32_t mvd_ver); // Number of fixed point fractional bits used in the fractional bit table. 
#define CTX_FRAC_BITS 15 diff --git a/src/search.c b/src/search.c index 43a07d4b..ad24b501 100644 --- a/src/search.c +++ b/src/search.c @@ -740,29 +740,61 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; - state->search_cabac.update = 1; + cabac_data_t* cabac = &state->search_cabac; + cabac->update = 1; if(depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); + cabac_ctx_t* ctx = &(cabac->ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. - cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); + cabac_ctx_t* ctx = &(cabac->ctx.part_size_model[0]); + CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); } - double mode_bits; + double mode_bits = 0; + if (state->frame->slicetype != KVZ_SLICE_I) { + int ctx_skip = 0; + if (x > 0) { + ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; + } + if (y > 0) { + ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; + } + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, mode_bits, "skip_flag"); + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, mode_bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + mode_bits += 1; + } + if (symbol == 0) { + break; + } + } + } + } + + } if 
(cur_cu->type == CU_INTRA) { if(state->frame->slicetype != KVZ_SLICE_I) { - cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, bits, "pred_mode_flag"); + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "pred_mode_flag"); } - mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + mode_bits += calc_mode_bits(state, lcu, cur_cu, x, y); } - else { - mode_bits = inter_bitcost; + else if (!cur_cu->skipped) { + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "pred_mode_flag"); + mode_bits += inter_bitcost; } bits += mode_bits; cost = mode_bits * state->lambda; @@ -795,7 +827,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - state->search_cabac.update = 0; + cabac->update = 0; } bool can_split_cu = diff --git a/src/search_inter.c b/src/search_inter.c index 983ffcc8..a4c75d9e 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -323,19 +323,19 @@ static void select_starting_point(inter_search_info_t *info, } -static uint32_t get_mvd_coding_cost(const encoder_state_t *state, +static double get_mvd_coding_cost(const encoder_state_t *state, const cabac_data_t* cabac, const int32_t mvd_hor, const int32_t mvd_ver) { - unsigned bitcost = 0; + double bitcost = 0; const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; // Round and shift back to integer bits. 
- return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + return bitcost / (1 << CTX_FRAC_BITS); } @@ -353,7 +353,7 @@ static int select_mv_cand(const encoder_state_t *state, return 0; } - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + double (*mvd_coding_cost)(const encoder_state_t * const state, const cabac_data_t*, int32_t, int32_t); if (state->encoder_control->cfg.mv_rdo) { From 64b2806cc818f029b6387b5e868b79b06fb1cd6e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 11:26:45 +0200 Subject: [PATCH 55/85] Add couple of missing bits to the calculation and get intra neighbours from lcu rather than cu_array --- src/cabac.c | 18 ++++++----- src/cabac.h | 4 +-- src/encode_coding_tree.c | 27 ++++++++--------- src/encode_coding_tree.h | 4 +-- src/rdo.c | 15 +++++----- src/search.c | 65 ++++++++-------------------------------- 6 files changed, 47 insertions(+), 86 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index ed480e17..36931277 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -491,26 +491,28 @@ void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, ca /** * \brief */ -void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol) +void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, + cabac_ctx_t * const ctx, + uint32_t symbol, + const int32_t offset, + const uint32_t max_symbol, + double* bits_out) { int8_t code_last = max_symbol > symbol; assert(symbol <= max_symbol); if (!max_symbol) return; - - data->cur_ctx = &ctx[0]; - CABAC_BIN(data, symbol, "ums"); + + CABAC_FBITS_UPDATE(data, &ctx[0], symbol, *bits_out, "ums"); if (!symbol) return; while (--symbol) { - data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 1, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums"); } if (code_last) { - data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 0, "ums"); + CABAC_FBITS_UPDATE(data, 
&ctx[offset], 0,*bits_out, "ums"); } } diff --git a/src/cabac.h b/src/cabac.h index 62d59d9e..f9190045 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -125,8 +125,8 @@ void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, ca uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, - uint32_t symbol, int32_t offset, - uint32_t max_symbol); + uint32_t symbol, int32_t offset, + uint32_t max_symbol, double* bits_out); void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); extern const float kvz_f_entropy_bits[128]; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a847640e..b25494f4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -290,7 +290,7 @@ static void encode_transform_coeff(encoder_state_t * const state, // cu_qp_delta_abs prefix cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0]; - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5); + kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL); if (qp_delta_abs >= 5) { // cu_qp_delta_abs suffix @@ -412,7 +412,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, cabac->ctx.mvp_idx_model, CU_GET_MV_CAND(cur_cu, ref_list_idx), 1, - AMVP_MAX_NUM_CANDS - 1); + AMVP_MAX_NUM_CANDS - 1, bits_out); } // for ref_list } // if !merge @@ -467,7 +467,7 @@ static INLINE uint8_t intra_mode_encryption(encoder_state_t * const state, static void encode_intra_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, double* bits_out) + int x, int y, int depth, lcu_t* lcu, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual[4]; @@ -506,19 
+506,19 @@ static void encode_intra_coding_unit(encoder_state_t * const state, for (int j = 0; j < num_pred_units; ++j) { const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); + const cu_info_t *cur_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)) : kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); const cu_info_t *left_pu = NULL; const cu_info_t *above_pu = NULL; if (pu_x > 0) { assert(pu_x >> 2 > 0); - left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y); + left_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x -1), SUB_SCU(pu_y)) : kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y); } // Don't take the above PU across the LCU boundary. if (pu_y % LCU_WIDTH > 0 && pu_y > 0) { assert(pu_y >> 2 > 0); - above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1); + above_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y - 1)) : kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1); } if (do_crypto) { @@ -893,7 +893,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); } #if ENABLE_PCM @@ -952,11 +952,11 @@ end: } -void kvz_mock_encode_coding_unit( +double kvz_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, int x, int y, int depth, - lcu_t* lcu) { + lcu_t* lcu, cu_info_t* cur_cu) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; @@ -964,9 +964,7 @@ void kvz_mock_encode_coding_unit( int y_local = SUB_SCU(y); const int cu_width = LCU_WIDTH >> depth; - const int half_cu = cu_width >> 1; - - const cu_info_t* cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { left_cu = 
LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); @@ -1037,7 +1035,7 @@ void kvz_mock_encode_coding_unit( } } } - return; + return bits; } } // Prediction mode @@ -1072,8 +1070,9 @@ void kvz_mock_encode_coding_unit( } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); } + return bits; } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index b8e29358..42a1a981 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -52,11 +52,11 @@ void kvz_encode_mvd(encoder_state_t * const state, int32_t mvd_ver, double* bits_out); -void kvz_mock_encode_coding_unit( +double kvz_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, int x, int y, int depth, - lcu_t* lcu); + lcu_t* lcu, cu_info_t* cur_cu); void kvz_encode_inter_prediction_unit(encoder_state_t* const state, cabac_data_t* const cabac, diff --git a/src/rdo.c b/src/rdo.c index 5b6c3b49..fc0b2198 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1081,8 +1081,8 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, x - mv_cand[1][0], y - mv_cand[1][1], }; - uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); - uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); + double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + double cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -1161,11 +1161,12 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // Signal which candidate MV to use kvz_cabac_write_unary_max_symbol( - cabac, - cabac->ctx.mvp_idx_model, - cur_mv_cand, - 1, - AMVP_MAX_NUM_CANDS - 1); + cabac, + cabac->ctx.mvp_idx_model, + cur_mv_cand, + 1, + AMVP_MAX_NUM_CANDS - 1, + NULL); } } } diff --git a/src/search.c b/src/search.c index 
ad24b501..1fc36566 100644 --- a/src/search.c +++ b/src/search.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "imagelist.h" #include "inter.h" #include "intra.h" @@ -743,61 +744,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; - if(depth < MAX_DEPTH) { - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); + if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { + bits += kvz_mock_encode_coding_unit( + state, + cabac, + x, y, depth, + lcu, + cur_cu); } - else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { - // Add cost of intra part_size. - cabac_ctx_t* ctx = &(cabac->ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); + else { + // Intra 4×4 PUs } - - double mode_bits = 0; - if (state->frame->slicetype != KVZ_SLICE_I) { - int ctx_skip = 0; - if (x > 0) { - ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; - } - if (y > 0) { - ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, mode_bits, "skip_flag"); - if (cur_cu->skipped) { - int16_t num_cand = state->encoder_control->cfg.max_merge; - if (num_cand > 1) { - for (int ui = 0; ui < num_cand - 1; ui++) { - int32_t symbol = (ui != cur_cu->merge_idx); - if (ui == 0) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, mode_bits, "MergeIndex"); - } - else { - CABAC_BIN_EP(cabac, symbol, "MergeIndex"); - mode_bits += 1; - } - if (symbol == 0) { - break; - } - } - } - } - - } - if (cur_cu->type == CU_INTRA) { - if(state->frame->slicetype != KVZ_SLICE_I) { - cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(cabac, 
ctx, 1, mode_bits, "pred_mode_flag"); - } - mode_bits += calc_mode_bits(state, lcu, cur_cu, x, y); - } - else if (!cur_cu->skipped) { - cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "pred_mode_flag"); - mode_bits += inter_bitcost; - } - bits += mode_bits; - cost = mode_bits * state->lambda; + + cost = bits * state->lambda; cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (state->encoder_control->chroma_format != KVZ_CSP_400) { From 951a845f086232c5114b11244c183065edeab87d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 11:48:59 +0200 Subject: [PATCH 56/85] Remove consts and fix wrong types --- src/rdo.c | 5 ++--- src/sao.c | 14 +++++++------- src/search_intra.c | 12 ++++++------ 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index fc0b2198..04b9aca9 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -271,8 +271,7 @@ static INLINE uint32_t get_coeff_cabac_cost( scan_mode, 0); if(cabac_copy.update) { - - memcpy(&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); } return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } @@ -1036,7 +1035,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost) + double* bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; diff --git a/src/sao.c b/src/sao.c index 179f4311..b7d76e64 100644 --- a/src/sao.c +++ b/src/sao.c @@ -53,7 +53,7 @@ static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t { double mode_bits = 0.0; cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); @@ -74,8 +74,8 @@ static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); @@ -91,8 +91,8 @@ static double sao_mode_bits_edge(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); @@ -132,8 +132,8 @@ static double sao_mode_bits_band(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); diff --git a/src/search_intra.c b/src/search_intra.c index e29f29a3..2986f67f 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -269,7 +269,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // Add bits for split_transform_flag = 1, because transform depth search bypasses // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); *bit_cost += tr_split_bit; } @@ -283,7 +283,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (state->encoder_control->chroma_format != KVZ_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } @@ -647,9 +647,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } -double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) +double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; bool mode_in_preds = false; @@ -659,7 +659,7 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t } } - const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); + cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); if (state->search_cabac.update) { if(mode_in_preds) { @@ -688,7 +688,7 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); + cabac_ctx_t *ctx 
= &(cabac->ctx.chroma_pred_model[0]); double mode_bits = 0; CABAC_FBITS_UPDATE(cabac, ctx, chroma_mode != luma_mode, mode_bits, "intra_chroma_pred_mode"); From a038ccc19ae74b4287204b6d9e95a3847a8e0b42 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 13:16:48 +0200 Subject: [PATCH 57/85] =?UTF-8?q?add=20back=20bitcost=20for=204=C3=974=20i?= =?UTF-8?q?ntra=20PUs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/search.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/search.c b/src/search.c index 1fc36566..def91fcb 100644 --- a/src/search.c +++ b/src/search.c @@ -754,6 +754,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else { // Intra 4×4 PUs + if (state->frame->slicetype != KVZ_SLICE_I) { + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 1, bits, "pred_mode_flag"); + } + bits += calc_mode_bits(state, lcu, cur_cu, x, y); } cost = bits * state->lambda; From 243e45f07e9696f9ee5515be385d1af668bcf9d3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 20 Dec 2021 09:36:23 +0200 Subject: [PATCH 58/85] accurate bit cost calculation when using transform skip --- src/cabac.c | 6 +- src/cabac.h | 1 - src/encoderstate.c | 2 + src/search.c | 171 +++++++++++++++++++++++++++++++++++++++------ 4 files changed, 153 insertions(+), 27 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 36931277..ae31fb0b 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -106,8 +106,8 @@ void kvz_cabac_start(cabac_data_t * const data) void kvz_cabac_encode_bin(cabac_data_t * const data, const uint32_t bin_value) { uint32_t lps; - - + + if (!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (bin_value)); lps = kvz_g_auc_lpst_table[CTX_STATE(data->cur_ctx)][(data->range >> 6) & 3]; data->range -= lps; @@ -577,6 +577,6 @@ uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, bins = ( (bins >> (num_bins >>1) 
) << (num_bins >>1) ) | state->crypto_prev_pos; } } - kvz_cabac_encode_bins_ep(data, bins, num_bins); + CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb"); return num_bins; } diff --git a/src/cabac.h b/src/cabac.h index f9190045..6c46011b 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -156,7 +156,6 @@ extern double bits_written; #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ - if(!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (value));\ kvz_cabac_encode_bin((data), (value)); \ if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } diff --git a/src/encoderstate.c b/src/encoderstate.c index 012476df..d02ca483 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -685,6 +685,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) const uint64_t existing_bits = kvz_bitstream_tell(&state->stream); //Encode SAO + state->cabac.update = 1; if (encoder->cfg.sao_type) { encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]); } @@ -737,6 +738,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_crypto_delete(&state->crypto_hdl); } } + state->cabac.update = 0; pthread_mutex_lock(&state->frame->rc_lock); const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits; diff --git a/src/search.c b/src/search.c index def91fcb..461eae4e 100644 --- a/src/search.c +++ b/src/search.c @@ -299,30 +299,34 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, return sum + tr_tree_bits * state->lambda; } + + if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac state 
changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. + // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } + } + // Add transform_tree cbf_luma bit cost. + const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; if (pred_cu->type == CU_INTRA || - tr_depth > 0 || + is_tr_split || cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { - // Because these need to be coded before the luma cbf they also need to be counted - // before the cabac state changes. However, since this branch is only executed when - // calculating the last RD cost it is not problem to include the chroma cbf costs in - // luma, because the chroma cost is calculated right after the luma cost. - // However, if we have different tr_depth, the bits cannot be written in correct - // order anyways so do not touch the chroma cbf here. 
- if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); - cabac->cur_ctx = cr_ctx; - int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } - } + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); *bit_cost += tr_tree_bits; } @@ -390,7 +394,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); - int sum = 0; + double sum = 0; sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); @@ -426,6 +430,126 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return (double)ssd * CHROMA_MULT + bits * state->lambda; } +static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu, + double* bit_cost) { + const int width = LCU_WIDTH >> depth; + + // cur_cu is used for TU parameters. + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + + double coeff_bits = 0; + double tr_tree_bits = 0; + + // Check that lcu is not in + assert(x_px >= 0 && x_px < LCU_WIDTH); + assert(y_px >= 0 && y_px < LCU_WIDTH); + + const uint8_t tr_depth = tr_cu->tr_depth - depth; + + const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + + // Add transform_tree split_transform_flag bit cost. 
+ bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + int max_tr_depth; + if (pred_cu->type == CU_INTRA) { + max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; + } + else { + max_tr_depth = state->encoder_control->tr_depth_inter; + } + if (width <= TR_MAX_WIDTH + && width > TR_MIN_WIDTH + && !intra_split_flag + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + { + cabac_ctx_t* ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); + CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); + } + + if(state->encoder_control->chroma_format != KVZ_CSP_400) { + if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_U)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_u, tr_tree_bits, "cbf_cb"); + } + if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_V)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_v, tr_tree_bits, "cbf_cr"); + } + } + + if (tr_depth > 0) { + int offset = LCU_WIDTH >> (depth + 1); + double sum = 0; + *bit_cost += tr_tree_bits; + + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + return sum + tr_tree_bits * state->lambda; + } + const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; + + // Add transform_tree cbf_luma bit cost. 
+ const int is_tr_split = depth - tr_cu->depth; + if (pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); + + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + } + *bit_cost += tr_tree_bits; + // SSD between reconstruction and original + unsigned luma_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + int index = y_px * LCU_WIDTH + x_px; + luma_ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width); + } + + { + int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + + coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); + } + + unsigned chroma_ssd = 0; + if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) { + const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; + const int chroma_width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + chroma_ssd = ssd_u + ssd_v; + } + + { + int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order); + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order); + } + } + *bit_cost += coeff_bits; + double bits = tr_tree_bits + coeff_bits; + return luma_ssd * LUMA_MULT + chroma_ssd * CHROMA_MULT + bits * state->lambda; +} + // Return estimate of bits used to code prediction mode of cur_cu. 
static double calc_mode_bits(const encoder_state_t *state, @@ -763,10 +887,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = bits * state->lambda; - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - } + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, &bits); + //if (state->encoder_control->chroma_format != KVZ_CSP_400) { + // cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); + //} FILE_BITS(bits, x, y, depth, "final rd bits"); @@ -826,6 +950,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } + FILE_BITS(split_bits, x, y, depth, "split"); state->search_cabac.update = 0; split_cost += split_bits * state->lambda; From f83e21735ce602f2672cc4fa51a2aaf8b8294e92 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 20 Dec 2021 10:44:19 +0200 Subject: [PATCH 59/85] Fix couple of mistakes --- src/search.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index 461eae4e..2cf9dae6 100644 --- a/src/search.c +++ b/src/search.c @@ -437,6 +437,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, double* bit_cost) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -466,17 +467,18 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth + && !skip_residual_coding) { cabac_ctx_t* ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); } - if(state->encoder_control->chroma_format != KVZ_CSP_400) { - if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_U)) { + if(state->encoder_control->chroma_format != KVZ_CSP_400 && !skip_residual_coding) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_u, tr_tree_bits, "cbf_cb"); } - if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_V)) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_v, tr_tree_bits, "cbf_cr"); } } @@ -496,10 +498,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, // Add transform_tree cbf_luma bit cost. 
const int is_tr_split = depth - tr_cu->depth; - if (pred_cu->type == CU_INTRA || + if ((pred_cu->type == CU_INTRA || is_tr_split || cb_flag_u || - cb_flag_v) + cb_flag_v) + && !skip_residual_coding) { cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); From 5ed1ffb5d4b1e4036ebc45736c4195edfcd53711 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 21 Dec 2021 17:04:47 +0200 Subject: [PATCH 60/85] WIP: pre Christmas --- src/encode_coding_tree.c | 6 +++--- src/encode_coding_tree.h | 5 +++++ src/rdo.c | 28 +++++++++++++--------------- src/search.c | 2 +- src/search_inter.c | 26 +++++++++++++------------- 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index b25494f4..ffd8ae1e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -656,7 +656,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state, encode_transform_coeff(state, x, y, depth, 0, 0, 0); } -static double encode_part_mode(encoder_state_t * const state, +double kvz_encode_part_mode(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int depth) @@ -863,7 +863,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } // part_mode - encode_part_mode(state, cabac, cur_cu, depth); + kvz_encode_part_mode(state, cabac, cur_cu, depth); if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; @@ -1044,7 +1044,7 @@ double kvz_mock_encode_coding_unit( } // part_mode - bits += encode_part_mode(state, cabac, cur_cu, depth); + bits += kvz_encode_part_mode(state, cabac, cur_cu, depth); if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 42a1a981..d189e6e0 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -58,6 +58,11 @@ double kvz_mock_encode_coding_unit( int x, int y, int depth, lcu_t* 
lcu, cu_info_t* cur_cu); +double kvz_encode_part_mode(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int depth); + void kvz_encode_inter_prediction_unit(encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, diff --git a/src/rdo.c b/src/rdo.c index 04b9aca9..e8805494 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1062,14 +1062,13 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, } // Store cabac state and contexts - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); + memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t)); // Clear bytes and bits and set mode to "count" state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; cabac = &state_cabac_copy; + double bits = 0; if (!merged) { vector2d_t mvd1 = { @@ -1094,7 +1093,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, merged, "MergeFlag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag"); num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { @@ -1102,10 +1101,10 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + bits += 1; } if (symbol == 0) break; } @@ -1128,24 +1127,23 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, if (ref_list[ref_list_idx] > 1) { // parseRefFrmIdx int32_t ref_frame = ref_idx; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { int32_t i; int32_t ref_num = ref_list[ref_list_idx] - 2; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); + ref_frame--; for (i = 0; i < ref_num; ++i) { const uint32_t symbol = (i == ref_frame) ? 0 : 1; if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + bits += 1; } if (symbol == 0) break; } @@ -1155,7 +1153,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, NULL); + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits); } // Signal which candidate MV to use @@ -1165,12 +1163,12 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, cur_mv_cand, 1, AMVP_MAX_NUM_CANDS - 1, - NULL); + &bits); } } } - *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); + *bitcost = bits; // Store bitcost before restoring cabac return *bitcost * state->lambda_sqrt; diff --git a/src/search.c b/src/search.c index 2cf9dae6..ff116140 100644 --- a/src/search.c +++ b/src/search.c @@ -676,7 +676,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // Assign correct depth limit constraint_t* constr = state->constraint; - if(constr->ml_intra_depth_ctu) { + if(constr->ml_intra_depth_ctu) { pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; } diff --git a/src/search_inter.c b/src/search_inter.c index 
a4c75d9e..93ef2333 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "image.h" #include "imagelist.h" #include "inter.h" @@ -343,7 +344,7 @@ static int select_mv_cand(const encoder_state_t *state, int16_t mv_cand[2][2], int32_t mv_x, int32_t mv_y, - uint32_t *cost_out) + double*cost_out) { const bool same_cand = (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); @@ -362,12 +363,12 @@ static int select_mv_cand(const encoder_state_t *state, mvd_coding_cost = get_mvd_coding_cost; } - uint32_t cand1_cost = mvd_coding_cost( + double cand1_cost = mvd_coding_cost( state, &state->cabac, mv_x - mv_cand[0][0], mv_y - mv_cand[0][1]); - uint32_t cand2_cost; + double cand2_cost; if (same_cand) { cand2_cost = cand1_cost; } else { @@ -419,7 +420,7 @@ static double calc_mvd_cost(const encoder_state_t *state, // Check mvd cost only if mv is not merged if (!merged) { - uint32_t mvd_cost = 0; + double mvd_cost = 0; select_mv_cand(state, mv_cand, x, y, &mvd_cost); temp_bitcost += mvd_cost; } @@ -2165,7 +2166,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost = 0; *inter_bitcost = 0; - + for (int i = 0; i < num_pu; ++i) { const int x_pu = PU_GET_X(part_mode, width, x_local, i); const int y_pu = PU_GET_Y(part_mode, width, y_local, i); @@ -2233,14 +2234,13 @@ void kvz_search_cu_smp(encoder_state_t * const state, } } - // Count bits spent for coding the partition mode. - int smp_extra_bits = 1; // horizontal or vertical - if (state->encoder_control->cfg.amp_enable) { - smp_extra_bits += 1; // symmetric or asymmetric - if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) { - smp_extra_bits += 1; // U,L or D,R - } - } + double smp_extra_bits = kvz_encode_part_mode( + state, + &state->search_cabac, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), + depth + ); + // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
smp_extra_bits += 6; From 8d12884e4318c39e99add6fae93fd56d88c1e5de Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 5 Jan 2022 11:14:44 +0200 Subject: [PATCH 61/85] disable VERBOSE --- src/global.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/global.h b/src/global.h index 9a2ee989..2ad0830b 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -#define VERBOSE 1 +// #define VERBOSE 1 /* CONFIG VARIABLES */ From 159793f5b4f028b23df2fde7a61dab129df4c5d2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 6 Jan 2022 09:12:03 +0200 Subject: [PATCH 62/85] more accurate get_mvd_coding_cost --- src/search_inter.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 93ef2333..fcd64ba2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -247,10 +247,10 @@ static bool check_mv_cost(inter_search_info_t *info, static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) { - // Calculate 2 * log2(symbol + 2) + // Calculate 2 * log2(symbol ) unsigned bins = 0; - symbol += 2; + symbol += 0; if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } @@ -324,19 +324,21 @@ static void select_starting_point(inter_search_info_t *info, } -static double get_mvd_coding_cost(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +static double get_mvd_coding_cost(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - double bitcost = 0; + double bitcost = 4 << CTX_FRAC_BITS; const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; + bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += abs_mvd.y == 1 ? 
1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; // Round and shift back to integer bits. - return bitcost / (1 << CTX_FRAC_BITS); + return bitcost / (1 << CTX_FRAC_BITS); } From aaac260438c336e40f3e52f212fdf82feb2600d9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 27 Jan 2022 13:35:47 +0200 Subject: [PATCH 63/85] better merge cost --- src/search_inter.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index fcd64ba2..f56998b7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1641,6 +1641,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->cost[i] = MAX_DOUBLE; } + const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { @@ -1678,8 +1679,9 @@ static void search_pu_inter(encoder_state_t * const state, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - merge->cost[merge->size] += merge_idx * info->state->lambda_sqrt; - merge->bits[merge->size] = merge_idx; + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + merge->cost[merge->size] += bits * info->state->lambda_sqrt; + merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; merge->unit[merge->size] = *cur_pu; @@ -2013,7 +2015,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, lcu, false); - double bits; + double bits = 0; *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); if (reconstruct_chroma) { *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); From 
f3f0037123bc4eb85e99fef0314118c0dfe3d672 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 27 Jan 2022 13:41:19 +0200 Subject: [PATCH 64/85] include root_cbf cost --- src/search_inter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index f56998b7..9bda59b4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2016,9 +2016,14 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, false); double bits = 0; - *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); - if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + *inter_bitcost += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, !!cbf); + + if(cbf) { + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + if (reconstruct_chroma) { + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + } } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); From 5afd3570f6194a0c4c733793b319e6ac2d6e8071 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 08:14:57 +0200 Subject: [PATCH 65/85] Update cu_qt_root_cbf_model --- src/search.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search.c b/src/search.c index ff116140..246583d1 100644 --- a/src/search.c +++ b/src/search.c @@ -454,6 +454,9 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + if(pred_cu->type == CU_INTER && !pred_cu->skipped && depth == pred_cu->depth) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_qt_root_cbf_model, cbf_is_set_any(pred_cu->cbf, depth), tr_tree_bits, "root_cbf"); + } // Add transform_tree split_transform_flag bit cost. 
bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; From 1a9e54601fc68dd461cb1e508fc78651ab9f2622 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 09:08:25 +0200 Subject: [PATCH 66/85] Calculate rd2 cost for all inter modes instead of just the final one --- src/search_inter.c | 36 ++++++++++++++++++++++-------------- src/search_inter.h | 5 +++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9bda59b4..dadd3df7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1673,13 +1673,17 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - - merge->cost[merge->size] = kvz_satd_any_size(width, height, - lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, - lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); - - // Add cost of coding the merge index + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + if(state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &merge->cost[merge->size], &bits); + } + else { + merge->cost[merge->size] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + } + // Add cost of coding the merge index merge->cost[merge->size] += bits * info->state->lambda_sqrt; merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; @@ -1769,6 +1773,10 @@ static void search_pu_inter(encoder_state_t * const state, amvp[0].size > 0 ? amvp[0].keys[0] : 0, amvp[1].size > 0 ? 
amvp[1].keys[0] : 0 }; + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], @@ -1850,6 +1858,10 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &frac_cost, &frac_bits); + } + amvp[list].cost[key] = frac_cost; amvp[list].bits[key] = frac_bits; } @@ -1919,6 +1931,7 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); @@ -1971,6 +1984,9 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + } } } @@ -2115,14 +2131,6 @@ void kvz_search_cu_inter(encoder_state_t * const state, cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); - } if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index bb9067c5..da547d90 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -92,5 
+92,10 @@ unsigned kvz_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); +void kvz_cu_cost_inter_rd2(encoder_state_t* const state, + int x, int y, int depth, + lcu_t* lcu, + double* inter_cost, + double* inter_bitcost); #endif // SEARCH_INTER_H_ From 6d73db5a2a44e1caf3bcc217dea36a631e8756af Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 12:26:12 +0200 Subject: [PATCH 67/85] Probably correct RD cost calculation for all inter modes --- src/search.c | 33 ++++++------ src/search.h | 9 ++++ src/search_inter.c | 123 +++++++++++++++++++++++++++++++++++++-------- src/search_inter.h | 3 ++ 4 files changed, 130 insertions(+), 38 deletions(-) diff --git a/src/search.c b/src/search.c index 246583d1..185e89fa 100644 --- a/src/search.c +++ b/src/search.c @@ -60,14 +60,6 @@ // Cost threshold for doing intra search in inter frames with --rd=0. static const int INTRA_THRESHOLD = 8; -// Modify weight of luma SSD. -#ifndef LUMA_MULT -# define LUMA_MULT 0.8 -#endif -// Modify weight of chroma SSD. 
-#ifndef CHROMA_MULT -# define CHROMA_MULT 1.5 -#endif static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) { @@ -216,16 +208,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; - ssd += LUMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], LCU_WIDTH, LCU_WIDTH, cu_width ); if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); @@ -253,6 +245,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double *bit_cost) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -280,7 +273,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth + && !skip_residual_coding) { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); @@ -300,7 +294,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, } - if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { + if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in @@ -340,7 +334,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, width); } - { + + if (!skip_residual_coding) { int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; @@ -349,7 +344,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LUMA_MULT + bits * state->lambda; + return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda; } @@ -362,6 +357,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); double tr_tree_bits = 0; double coeff_bits = 0; @@ -376,7 +372,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth)) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) { const int tr_depth = depth - pred_cu->depth; cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); @@ -417,6 +413,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, ssd = ssd_u + ssd_v; } + if (!skip_residual_coding) { int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); @@ -427,7 +424,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * CHROMA_MULT + bits * state->lambda; + return (double)ssd * KVZ_CHROMA_MULT + bits * state->lambda; } static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, @@ -553,7 +550,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, } *bit_cost += coeff_bits; double bits = tr_tree_bits + coeff_bits; - return luma_ssd * LUMA_MULT + chroma_ssd * CHROMA_MULT + bits * state->lambda; + return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda; } diff --git a/src/search.h b/src/search.h index b11a0ad5..bcd517cb 100644 --- a/src/search.h +++ b/src/search.h @@ -46,6 +46,15 @@ #define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) + // Modify weight of luma SSD. 
+#ifndef KVZ_LUMA_MULT +# define KVZ_LUMA_MULT 0.8 +#endif +// Modify weight of chroma SSD. +#ifndef KVZ_CHROMA_MULT +# define KVZ_CHROMA_MULT 1.5 +#endif + /** * \brief Data collected during search processes. * diff --git a/src/search_inter.c b/src/search_inter.c index dadd3df7..d1a031ac 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1160,6 +1160,30 @@ static void search_frac(inter_search_info_t *info, *best_bits = bitcost; } +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a) { + assert(!(lcu && cu_a)); + int context = 0; + if(lcu) { + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + if (x) { + context += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; + } + if (y) { + context += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; + } + } + else { + if (x > 0) { + context += kvz_cu_array_at_const(cu_a, x - 1, y)->skipped; + } + if (y > 0) { + context += kvz_cu_array_at_const(cu_a, x, y - 1)->skipped; + } + } + return context; +} + /** * \brief Calculate the scaled MV */ @@ -1676,7 +1700,7 @@ static void search_pu_inter(encoder_state_t * const state, double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &merge->cost[merge->size], &bits); + kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1773,10 +1797,6 @@ static void search_pu_inter(encoder_state_t * const state, amvp[0].size > 0 ? amvp[0].keys[0] : 0, amvp[1].size > 0 ? 
amvp[1].keys[0] : 0 }; - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); - } cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], @@ -1808,6 +1828,11 @@ static void search_pu_inter(encoder_state_t * const state, } } + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } + // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { @@ -1859,7 +1884,7 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &frac_cost, &frac_bits); + kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); } amvp[list].cost[key] = frac_cost; @@ -1985,7 +2010,7 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } @@ -2012,39 +2037,96 @@ static void search_pu_inter(encoder_state_t * const state, */ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, int x, int y, int depth, + cu_info_t* cur_cu, lcu_t *lcu, double *inter_cost, double* inter_bitcost){ - - cu_info_t 
*cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int tr_depth = MAX(1, depth); if (cur_cu->part_size != SIZE_2Nx2N) { tr_depth = depth + 1; } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + const int x_px = SUB_SCU(x); + const int y_px = SUB_SCU(y); + const int width = LCU_WIDTH >> depth; + const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); - kvz_quantize_lcu_residual(state, true, reconstruct_chroma, - x, y, depth, - NULL, - lcu, - false); + int index = y_px * LCU_WIDTH + x_px; + double ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width) * KVZ_LUMA_MULT; + if (reconstruct_chroma) { + int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; + double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + ssd += ssd_u + ssd_v; + ssd *= KVZ_CHROMA_MULT; + } + double no_cbf_bits; double bits = 0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); - *inter_bitcost += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, !!cbf); + int skip_context = kvz_get_skip_context(x, y, lcu, NULL); + if (cur_cu->merged) { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); + } + else { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1); + } + double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; + kvz_quantize_lcu_residual(state, true, reconstruct_chroma, + x, y, depth, + NULL, + lcu, + false); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + double temp_bits = 0; if(cbf) { - *inter_cost = 
kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); } } + else { + // If we have no coeffs after quant we already have the cost calculated + *inter_cost = no_cbf_cost; + if(cur_cu->merged) { + *inter_bitcost += no_cbf_bits; + } + return; + } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += *inter_bitcost * state->lambda; + *inter_cost += (*inter_bitcost +bits )* state->lambda; + + if(no_cbf_cost < *inter_cost && 0) { + cur_cu->cbf = 0; + if (cur_cu->merged) { + cur_cu->skipped = 1; + } + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + *inter_cost = no_cbf_cost; + if (cur_cu->merged) { + *inter_bitcost += no_cbf_bits; + } + } + else if(cur_cu->merged) { + if (cur_cu->merged) { + *inter_bitcost += bits; + } + } } @@ -2267,7 +2349,8 @@ void kvz_search_cu_smp(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, + x, y, depth, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), lcu, inter_cost, inter_bitcost); diff --git a/src/search_inter.h b/src/search_inter.h index da547d90..41988033 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -94,8 +94,11 @@ unsigned kvz_inter_satd_cost(const encoder_state_t* state, int y); void kvz_cu_cost_inter_rd2(encoder_state_t* const state, int x, int y, int depth, + cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, double* inter_bitcost); +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a); + #endif // SEARCH_INTER_H_ From a0e7165df4048c466403e784df52231707ff4081 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: 
Mon, 31 Jan 2022 08:33:31 +0200 Subject: [PATCH 68/85] use correct pu for rd calc --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index d1a031ac..42a577ba 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2085,7 +2085,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, kvz_quantize_lcu_residual(state, true, reconstruct_chroma, x, y, depth, - NULL, + cur_cu, lcu, false); From a9255901d9e2b2a8b52d7e2dc6e481d33e4b782f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 31 Jan 2022 09:31:44 +0200 Subject: [PATCH 69/85] Only perform rd2 calculation on the best candidate of the list if it exists But only for 2Nx2N blocks --- src/search_inter.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 42a577ba..92d96303 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1699,7 +1699,7 @@ static void search_pu_inter(encoder_state_t * const state, kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2) { + if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); } else { @@ -1828,9 +1828,9 @@ static void search_pu_inter(encoder_state_t * const state, } } - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + if (amvp[0].size) 
kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); } // Fractional-pixel motion estimation. @@ -1883,7 +1883,7 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2) { + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); } @@ -2009,7 +2009,7 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); - if (state->encoder_control->cfg.rdo >= 2) { + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } @@ -2051,6 +2051,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, const int x_px = SUB_SCU(x); const int y_px = SUB_SCU(y); const int width = LCU_WIDTH >> depth; + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); @@ -2063,12 +2065,11 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + width / 2); double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); - ssd += ssd_u + ssd_v; - ssd *= KVZ_CHROMA_MULT; 
+ width / 2); + ssd += (ssd_u + ssd_v) * KVZ_CHROMA_MULT; } double no_cbf_bits; double bits = 0; From ff02a84a96bc785449f3ddd58fb66b3989f20518 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 13:20:51 +0200 Subject: [PATCH 70/85] Probably better order of things --- src/search_inter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 92d96303..b504ed57 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1697,10 +1697,18 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { + + } } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1712,11 +1720,6 @@ static void search_pu_inter(encoder_state_t * const state, merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; - merge->unit[merge->size] = *cur_pu; - merge->unit[merge->size].type = CU_INTER; - merge->unit[merge->size].merge_idx = merge_idx; - merge->unit[merge->size].merged = true; - merge->unit[merge->size].skipped = false; merge->size++; } From 71b1e59548d896868e184593405901440a3d6258 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 13:24:57 +0200 Subject: [PATCH 71/85] Better early-skip? 
--- src/search_inter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index b504ed57..0bbca858 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1707,7 +1707,12 @@ static void search_pu_inter(encoder_state_t * const state, if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { - + *cur_pu = merge->unit[merge->size]; + merge->unit[0] = *cur_pu; + merge->size = 1; + merge->cost[0] = merge->cost[merge->size]; + merge->bits[0] = bits; + return; } } else { @@ -1732,7 +1737,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N && cfg->rdo < 2) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { // Reconstruct blocks with merge candidate. 
@@ -2115,7 +2120,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_cost += (*inter_bitcost +bits )* state->lambda; - if(no_cbf_cost < *inter_cost && 0) { + if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; if (cur_cu->merged) { cur_cu->skipped = 1; From c7174b25cf8204f6761833485b01f9defa0d1c08 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 1 Feb 2022 14:16:38 +0200 Subject: [PATCH 72/85] smp/amp CUs cannot be skipped --- src/search_inter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 0bbca858..dfd5563c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2082,7 +2082,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, double no_cbf_bits; double bits = 0; int skip_context = kvz_get_skip_context(x, y, lcu, NULL); - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); } @@ -2110,7 +2110,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, else { // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; - if(cur_cu->merged) { + if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { *inter_bitcost += no_cbf_bits; } return; @@ -2122,12 +2122,12 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->skipped = 1; } kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { *inter_bitcost += no_cbf_bits; } } From 3ac17ffd9525967e5c4e8bc7cbb3f703993b7204 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 09:51:25 
+0200 Subject: [PATCH 73/85] better early skip? --- src/search_inter.c | 80 +++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index dfd5563c..cb7c9683 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1706,14 +1706,6 @@ static void search_pu_inter(encoder_state_t * const state, double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); - if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { - *cur_pu = merge->unit[merge->size]; - merge->unit[0] = *cur_pu; - merge->size = 1; - merge->cost[0] = merge->cost[merge->size]; - merge->bits[0] = bits; - return; - } } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1737,41 +1729,49 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N && cfg->rdo < 2) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { - - // Reconstruct blocks with merge candidate. - // Check luma CBF. Then, check chroma CBFs if luma CBF is not set - // and chroma exists. - // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; - cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; - cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; - cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; - cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; - cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; - cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); - kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { - continue; + if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { + merge->size = 1; + merge->bits[0] = merge->bits[merge->keys[merge_key]]; + merge->cost[0] = merge->cost[merge->keys[merge_key]]; + merge->unit[0] = merge->unit[merge->keys[merge_key]]; + merge->keys[0] = 0; } - else if (has_chroma) { - kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { - cur_pu->type = CU_INTER; - cur_pu->merge_idx = merge_idx; - cur_pu->skipped = true; + else if(cfg->rdo < 2) { + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. 
+ int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; + kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + kvz_inter_recon_cu(state, lcu, x, y, width, true, false); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - merge->size = 1; - merge->cost[0] = 0.0; // TODO: Check this - merge->bits[0] = merge_idx; // TODO: Check this - merge->unit[0] = *cur_pu; - return; + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + continue; + } + else if (has_chroma) { + kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = true; + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; + return; + } } } } From 8cd81e3dcf5c6039816d1781ff3dcee8d3daf077 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 10:11:40 +0200 Subject: [PATCH 74/85] Only count smp extra cbf bits when rd < 2 --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index cb7c9683..abeff412 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2351,7 +2351,9 @@ void kvz_search_cu_smp(encoder_state_t * const state, // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
- smp_extra_bits += 6; + if(state->encoder_control->cfg.rdo < 2) { + smp_extra_bits += 6; + } *inter_bitcost += smp_extra_bits; From 49c8334dd7b88279892a41a1427982463155a3b0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 13:31:59 +0200 Subject: [PATCH 75/85] count skip flag --- src/search.c | 1 + src/search_inter.c | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index 185e89fa..b4581fe7 100644 --- a/src/search.c +++ b/src/search.c @@ -785,6 +785,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if(state->frame->slicetype != KVZ_SLICE_I) { double pred_mode_type_bits = 0; CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); intra_cost += pred_mode_type_bits * state->lambda; } if (intra_cost < cost) { diff --git a/src/search_inter.c b/src/search_inter.c index abeff412..e16ac483 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2021,7 +2021,15 @@ static void search_pu_inter(encoder_state_t * const state, kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } - + const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += no_skip_flag; + amvp[i].cost[best_key] += no_skip_flag * state->lambda; + } + } } /** @@ -2081,14 +2089,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, } double no_cbf_bits; double bits = 0; - int skip_context = 
kvz_get_skip_context(x, y, lcu, NULL); + const int skip_context = kvz_get_skip_context(x, y, lcu, NULL); + double no_skip_flag_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; } else { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1); + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0) + no_skip_flag_bits; + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; } double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; @@ -2118,7 +2127,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += (*inter_bitcost +bits )* state->lambda; + *inter_cost += (*inter_bitcost + bits)* state->lambda; if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; @@ -2131,10 +2140,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_bitcost += no_cbf_bits; } } - else if(cur_cu->merged) { - if (cur_cu->merged) { - *inter_bitcost += bits; - } + else if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + *inter_bitcost += no_skip_flag_bits; } } @@ -2349,6 +2356,8 @@ void kvz_search_cu_smp(encoder_state_t * const state, depth ); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); + // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
if(state->encoder_control->cfg.rdo < 2) { From 2ac9daf6e4ce5a937fe117e25d5870441a39d7d1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 3 Feb 2022 10:02:48 +0200 Subject: [PATCH 76/85] accurate inter bit cost during search --- src/search_inter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index e16ac483..55d6c3f2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2067,6 +2067,10 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, const int x_px = SUB_SCU(x); const int y_px = SUB_SCU(y); const int width = LCU_WIDTH >> depth; + cabac_data_t cabac_copy; + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); + cabac_copy.update = 1; + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); *cur_pu = *cur_cu; @@ -2090,16 +2094,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, double no_cbf_bits; double bits = 0; const int skip_context = kvz_get_skip_context(x, y, lcu, NULL); - double no_skip_flag_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; + bits += kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); } else { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0) + no_skip_flag_bits; - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; + no_cbf_bits = kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1); } - 
double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; + double no_cbf_cost = ssd + no_cbf_bits * state->lambda; kvz_quantize_lcu_residual(state, true, reconstruct_chroma, x, y, depth, @@ -2120,14 +2123,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_cbf_bits; + *inter_bitcost = no_cbf_bits; } return; } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += (*inter_bitcost + bits)* state->lambda; + *inter_cost += (bits)* state->lambda; + *inter_bitcost = bits; if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; @@ -2136,12 +2140,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, } kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_cbf_bits; - } - } - else if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_skip_flag_bits; + *inter_bitcost = no_cbf_bits; + } } From d720305feacfdaf650d6767e7d8f838eaae54902 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 3 Feb 2022 11:45:12 +0200 Subject: [PATCH 77/85] Don't double count some of the bits --- src/search.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/search.c b/src/search.c index b4581fe7..a320f5be 100644 --- a/src/search.c +++ b/src/search.c @@ -451,9 +451,6 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - if(pred_cu->type == CU_INTER && !pred_cu->skipped && depth == pred_cu->depth) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_qt_root_cbf_model, cbf_is_set_any(pred_cu->cbf, depth), tr_tree_bits, "root_cbf"); - } // Add transform_tree 
split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; @@ -753,9 +750,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } - double pred_mode_type_bits = 0; - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 0, pred_mode_type_bits, "pred_mode_flag"); - cost += pred_mode_type_bits * state->lambda; } } From d1ba62aea9ab2025c11700e2b7c9a922ab3aabc0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 4 Feb 2022 10:25:16 +0200 Subject: [PATCH 78/85] Better inter bit_costs for rdo < 2 --- src/encode_coding_tree.c | 10 ------ src/search.c | 16 ++++++++-- src/search_inter.c | 69 +++++++++++++++++++++------------------- 3 files changed, 50 insertions(+), 45 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ffd8ae1e..d7b80fb7 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1058,16 +1058,6 @@ double kvz_mock_encode_coding_unit( kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); } - - { - int cbf = cbf_is_set_any(cur_cu->cbf, depth); - // Only need to signal coded block flag if not skipped or merged - // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, bits, "rqt_root_cbf"); - } - - } } else if (cur_cu->type == CU_INTRA) { encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); diff --git a/src/search.c b/src/search.c index a320f5be..ef0587eb 100644 --- a/src/search.c +++ b/src/search.c @@ -452,6 +452,15 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + { + int cbf = cbf_is_set_any(pred_cu->cbf, depth); + // Only need to 
signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + } + + } // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; int max_tr_depth; @@ -851,9 +860,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; - } + int skip_ctx = kvz_get_skip_context(x, y, lcu, NULL); + inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1); + inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0); + inter_bitcost += cur_cu->merge_idx; } } lcu_fill_inter(lcu, x_local, y_local, cu_width); diff --git a/src/search_inter.c b/src/search_inter.c index 55d6c3f2..1c8e2fd0 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1666,6 +1666,7 @@ static void search_pu_inter(encoder_state_t * const state, } const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0); // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { @@ -1711,6 +1712,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->cost[merge->size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + bits += no_skip_flag; } // Add cost of coding the merge index 
merge->cost[merge->size] += bits * info->state->lambda_sqrt; @@ -1836,11 +1838,6 @@ static void search_pu_inter(encoder_state_t * const state, } } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); - } - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { @@ -1914,6 +1911,11 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { + if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -2021,13 +2023,16 @@ static void search_pu_inter(encoder_state_t * const state, kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } - const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); - for(int i = 0; i < 3; i++) { - if(amvp[i].size > 0) { - const uint8_t best_key = amvp[i].keys[0]; - amvp[i].bits[best_key] += no_skip_flag; - amvp[i].cost[best_key] += no_skip_flag * state->lambda; + if(cfg->rdo < 2) { + const int skip_contest = kvz_get_skip_context(x, y, 
lcu, NULL); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + const double part_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1); + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; + amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda; + } } } } @@ -2256,7 +2261,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void kvz_search_cu_smp(encoder_state_t * const state, +void kvz_search_cu_smp(encoder_state_t* const state, int x, int y, int depth, part_mode_t part_mode, @@ -2281,19 +2286,19 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost = 0; *inter_bitcost = 0; - + for (int i = 0; i < num_pu; ++i) { const int x_pu = PU_GET_X(part_mode, width, x_local, i); const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); - double cost = MAX_DOUBLE; + double cost = MAX_DOUBLE; double bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); - cu_info_t *best_inter_pu = NULL; + cu_info_t* best_inter_pu = NULL; // Find best AMVP PU for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { @@ -2301,7 +2306,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, int best_key = amvp[mv_dir - 1].keys[0]; if (amvp[mv_dir - 1].size > 0 && - amvp[mv_dir - 1].cost[best_key] < cost) { + amvp[mv_dir - 1].cost[best_key] < cost) { best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; cost = amvp[mv_dir - 1].cost[best_key]; @@ -2329,12 +2334,12 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost += cost; *inter_bitcost += bitcost; - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); + cu_info_t* cur_pu = 
LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); *cur_pu = *best_inter_pu; for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { - cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t* scu = LCU_GET_CU_AT_PX(lcu, x, y); scu->type = CU_INTER; scu->inter = cur_pu->inter; } @@ -2348,23 +2353,23 @@ void kvz_search_cu_smp(encoder_state_t * const state, assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } } + double smp_extra_bits = 0; + if (state->encoder_control->cfg.rdo < 2) { + smp_extra_bits = kvz_encode_part_mode( + state, + &state->search_cabac, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), + depth + ); - double smp_extra_bits = kvz_encode_part_mode( - state, - &state->search_cabac, - LCU_GET_CU_AT_PX(lcu, x_local, y_local), - depth - ); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); - - // The transform is split for SMP and AMP blocks so we need more bits for - // coding the CBF. - if(state->encoder_control->cfg.rdo < 2) { + // The transform is split for SMP and AMP blocks so we need more bits for + // coding the CBF. 
smp_extra_bits += 6; - } - *inter_bitcost += smp_extra_bits; + *inter_bitcost += smp_extra_bits; + } // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { From e0ed91658b4a7d3122b70d2fc3fc129dbc904605 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 8 Feb 2022 08:11:23 +0200 Subject: [PATCH 79/85] Fix no-early-skip without breaking early-skip --- src/search_inter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 1c8e2fd0..430a40c9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1713,9 +1713,9 @@ static void search_pu_inter(encoder_state_t * const state, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; + merge->cost[merge->size] += bits * info->state->lambda_sqrt; } // Add cost of coding the merge index - merge->cost[merge->size] += bits * info->state->lambda_sqrt; merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; @@ -2127,9 +2127,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, else { // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; - if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost = no_cbf_bits; - } + cur_cu->cbf = 0; + *inter_bitcost = no_cbf_bits; return; } @@ -2143,7 +2142,6 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->skipped = 1; } - kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; *inter_bitcost = no_cbf_bits; @@ -2233,7 +2231,9 @@ void kvz_search_cu_inter(encoder_state_t * const state, const int y_local = SUB_SCU(y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - + + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), + true, 
state->encoder_control->chroma_format != KVZ_CSP_400); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); From 00516d3dceaffedc2977d6c6086d2574a87281a2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 9 Feb 2022 09:51:21 +0200 Subject: [PATCH 80/85] Make sure intra does not accidentally skip coeff cost calculation --- src/search_intra.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/search_intra.c b/src/search_intra.c index 2986f67f..07dfb798 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -605,6 +605,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.depth = depth; pred_cu.type = CU_INTRA; pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); + pred_cu.skipped = 0; + pred_cu.merged = 0; pred_cu.intra.mode = modes[rdo_mode]; pred_cu.intra.mode_chroma = modes[rdo_mode]; FILL(pred_cu.cbf, 0); From b0037b814d20106c33ce5cc4885ec9b400604962 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 12:15:03 +0200 Subject: [PATCH 81/85] Use correct lambda # Conflicts: # src/search_inter.c --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 430a40c9..c2c69c00 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2031,7 +2031,7 @@ static void search_pu_inter(encoder_state_t * const state, if(amvp[i].size > 0) { const uint8_t best_key = amvp[i].keys[0]; amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; - amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda; + amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda_sqrt; } } } From 1ae5ecdec5eb024c7e040399db06992de011d6dc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 12:17:59 +0200 Subject: [PATCH 82/85] include pred_mode_bits for 2Nx2N inter pus for rd=0/1 # Conflicts: # src/search_inter.c --- 
src/search_inter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index c2c69c00..abc44278 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2026,12 +2026,16 @@ static void search_pu_inter(encoder_state_t * const state, if(cfg->rdo < 2) { const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); - const double part_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1); + const double part_mode_bits = state->encoder_control->cfg.smp_enable || state->encoder_control->cfg.amp_enable ? + CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1) + : 0; + const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0); + const double total_bits = no_skip_flag + part_mode_bits + pred_mode_bits; for(int i = 0; i < 3; i++) { if(amvp[i].size > 0) { const uint8_t best_key = amvp[i].keys[0]; - amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; - amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda_sqrt; + amvp[i].bits[best_key] += total_bits; + amvp[i].cost[best_key] += (total_bits)* state->lambda_sqrt; } } } From 352d6750f583f4c94f009ad6425045dd8ec275a8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 14:33:36 +0200 Subject: [PATCH 83/85] Remove instrumentation code --- src/cabac.c | 3 -- src/cabac.h | 14 ++------ src/encode_coding_tree.c | 7 ---- src/encoderstate.c | 2 -- src/sao.c | 3 -- src/search.c | 74 ++++++++++++++-------------------------- src/search.h | 12 +++---- src/search_inter.c | 12 +++---- src/search_intra.c | 29 ++++++---------- 9 files changed, 48 insertions(+), 108 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index ae31fb0b..7cd7d926 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -37,8 +37,6 @@ #include "extras/crypto.h" #include "kvazaar.h" -FILE* 
bit_cost_file = NULL; - const uint8_t kvz_g_auc_next_state_mps[128] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, @@ -107,7 +105,6 @@ void kvz_cabac_encode_bin(cabac_data_t * const data, const uint32_t bin_value) { uint32_t lps; - if (!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (bin_value)); lps = kvz_g_auc_lpst_table[CTX_STATE(data->cur_ctx)][(data->range >> 6) & 3]; data->range -= lps; diff --git a/src/cabac.h b/src/cabac.h index 6c46011b..b15cbb75 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -42,8 +42,6 @@ #include "bitstream.h" -extern FILE* bit_cost_file; - struct encoder_state_t; // Types @@ -139,7 +137,6 @@ extern const float kvz_f_entropy_bits[128]; CABAC_BIN((cabac), (val), (name));\ } \ } while(0) -extern double bits_written; // Macros #define CTX_STATE(ctx) ((ctx)->uc_state >> 1) @@ -147,30 +144,23 @@ extern double bits_written; #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } -#ifdef VERBOSE -#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) -#else -#define FILE_BITS(bits, x, y, depth, name) {} -#endif #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin((data), (value)); \ - if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ - (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } + if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx)); } #define CABAC_BINS_EP(data, value, bins, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bins_ep((data), (value), (bins)); \ - if(!(data)->only_count) bits_written 
+= (bins); \ if(!(data)->only_count) printf("%s = %u(%u bins), state = %u -> %u\n", \ (name), (uint32_t)(value), (bins), prev_state, (data)->cur_ctx->uc_state); } #define CABAC_BIN_EP(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin_ep((data), (value)); \ - if(!(data)->only_count) bits_written += 1; \ if(!(data)->only_count) printf("%s = %u, state = %u -> %u\n", \ (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state); } #else diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index d7b80fb7..afff8a06 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -943,13 +943,6 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } -#ifdef VERBOSE - if((x % 64 != 0 && y % 64 != 0) || 1) { - fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); - bits_written = 0; - } -#endif - } double kvz_mock_encode_coding_unit( diff --git a/src/encoderstate.c b/src/encoderstate.c index d02ca483..f187ca61 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -1661,11 +1661,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s } } -double bits_written; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) { - bits_written = 0; encoder_state_init_new_frame(state, frame); encoder_state_encode(state); diff --git a/src/sao.c b/src/sao.c index b7d76e64..e3154c20 100644 --- a/src/sao.c +++ b/src/sao.c @@ -510,7 +510,6 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (state->encoder_control->cfg.sao_type & 1){ sao_search_edge_sao(state, data, recdata, block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left); float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt); - FILE_BITS(mode_bits, 0, 0, 0, "sao mode bits"); int ddistortion = (int)(mode_bits * state->lambda + 0.5); unsigned buf_i; @@ -557,7 +556,6 @@ static 
void sao_search_best_mode(const encoder_state_t * const state, const kvz_ { float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); - FILE_BITS(mode_bits_none, 0, 0, 0, "Sao cost of nothing"); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; @@ -574,7 +572,6 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (merge_cand) { unsigned buf_i; float mode_bits = sao_mode_bits_merge(state, i + 1); - FILE_BITS(mode_bits, 0, 0, 0, (i == 0 ? "sao merge ""left" : "sao merge ""top")); int ddistortion = (int)(mode_bits * state->lambda + 0.5); switch (merge_cand->type) { diff --git a/src/search.c b/src/search.c index ef0587eb..943fd9b9 100644 --- a/src/search.c +++ b/src/search.c @@ -239,10 +239,9 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * prediction unit data needs to be coded. */ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); @@ -278,17 +277,16 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); - *bit_cost += tr_tree_bits; } if (tr_depth > 0) { int offset = width / 2; double sum = 0; - sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, 
depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } @@ -322,7 +320,6 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); - *bit_cost += tr_tree_bits; } // SSD between reconstruction and original @@ -340,7 +337,6 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); - *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -349,10 +345,9 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; @@ -385,17 +380,16 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } - *bit_cost += tr_tree_bits; } if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); double sum = 0; - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } @@ -420,7 +414,6 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order); coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order); - *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -428,10 +421,9 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, - const int x_px, const int y_px, const int depth, - const cu_info_t* const pred_cu, - lcu_t* const lcu, - double* bit_cost) { + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && 
pred_cu->cbf == 0); @@ -492,12 +484,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, if (tr_depth > 0) { int offset = LCU_WIDTH >> (depth + 1); double sum = 0; - *bit_cost += tr_tree_bits; - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; @@ -514,7 +505,6 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); } - *bit_cost += tr_tree_bits; // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless) { @@ -554,7 +544,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order); } } - *bit_cost += coeff_bits; + double bits = tr_tree_bits + coeff_bits; return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda; } @@ -895,13 +885,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, &bits); 
- //if (state->encoder_control->chroma_format != KVZ_CSP_400) { - // cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - //} - - FILE_BITS(bits, x, y, depth, "final rd bits"); - + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -958,7 +943,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } - FILE_BITS(split_bits, x, y, depth, "split"); state->search_cabac.update = 0; split_cost += split_bits * state->lambda; @@ -1023,12 +1007,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y) + bits; cost += mode_bits * state->lambda; - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (has_chroma) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - } - - FILE_BITS(bits, x, y, depth, "merged intra bits"); + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); @@ -1200,9 +1179,6 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { -#ifdef VERBOSE - if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); -#endif memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index bcd517cb..51b30ae4 100644 --- 
a/src/search.h +++ b/src/search.h @@ -79,13 +79,13 @@ void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, double *bits); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, double* bits); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index abc44278..c275a8bc 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2120,12 +2120,11 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, false); int cbf = cbf_is_set_any(cur_cu->cbf, depth); - - double temp_bits = 0; + if(cbf) { - *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); + *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); } } else { @@ -2135,9 +2134,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_bitcost = no_cbf_bits; return; } - - FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - + *inter_cost += (bits)* state->lambda; *inter_bitcost = bits; @@ -2246,7 +2243,6 @@ void 
kvz_search_cu_inter(encoder_state_t * const state, if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } - FILE_BITS((double)*inter_bitcost, x, y, depth, "regular inter bitcost"); } diff --git a/src/search_intra.c b/src/search_intra.c index 07dfb798..ad469859 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -179,8 +179,7 @@ static double search_intra_trdepth(encoder_state_t * const state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + lcu_t *const lcu) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -202,7 +201,6 @@ static double search_intra_trdepth(encoder_state_t * const state, double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; - double nosplit_bits = 0; if (depth > 0) { tr_cu->tr_depth = depth; @@ -223,9 +221,9 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_mode, chroma_mode, pred_cu, lcu); - nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); + nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); if (reconstruct_chroma) { - nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); + nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } // Early stop codition for the recursive search. 
@@ -252,15 +250,15 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } double tr_split_bit = 0.0; @@ -271,7 +269,6 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth >= 1 && depth <= 3) { cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); - *bit_cost += tr_split_bit; } // Add cost of cbf chroma bits on transform tree. 
@@ -290,7 +287,6 @@ static double search_intra_trdepth(encoder_state_t * const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } - *bit_cost += cbf_bits; } double bits = tr_split_bit + cbf_bits; @@ -613,9 +609,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - - double bit_costs = 0; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, &bit_costs); + + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); costs[rdo_mode] += mode_cost; // Early termination if no coefficients has to be coded @@ -640,9 +635,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.intra.mode = modes[0]; pred_cu.intra.mode_chroma = modes[0]; FILL(pred_cu.cbf, 0); - double bit_cost = 0; - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, &bit_cost); - FILE_BITS(bit_cost, x_px, y_px, depth, "tr_depth bits"); + search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); } return modes_to_check; @@ -738,7 +731,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, -1, chroma.mode, // skip luma NULL, lcu); double bits = 0; - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &bits); + chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); bits += mode_bits; From e39fbb11a7981bc7b943b321508b941f41e667d8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 16 Mar 2022 09:14:08 +0200 Subject: [PATCH 84/85] Disable bit calculations that always degrade quality --- 
src/search.c | 4 ++++ src/search_inter.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/search.c b/src/search.c index 943fd9b9..d1fc19a1 100644 --- a/src/search.c +++ b/src/search.c @@ -775,12 +775,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double intra_cost; kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically, counting these bits would be correct; however, counting + // them always degrades quality, so this block is disabled by default if(state->frame->slicetype != KVZ_SLICE_I) { double pred_mode_type_bits = 0; CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); intra_cost += pred_mode_type_bits * state->lambda; } +#endif if (intra_cost < cost) { cost = intra_cost; cur_cu->type = CU_INTRA; diff --git a/src/search_inter.c b/src/search_inter.c index c275a8bc..d0db3e89 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1666,7 +1666,13 @@ static void search_pu_inter(encoder_state_t * const state, } const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically, counting these bits would be correct; however, counting + // them always degrades quality, so this block is disabled by default const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0); +#else + const double no_skip_flag = 0; +#endif // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { From 9b7dc207b6bbc518345771b352e3dc8fe44bad8c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 16 Mar 2022
13:50:27 +0200 Subject: [PATCH 85/85] Remove unnecessary copying of CABAC state --- src/encoderstate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index f187ca61..6cf40292 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -732,8 +732,6 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_bitstream_align_zero(state->cabac.stream); kvz_cabac_start(&state->cabac); - memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); - state->search_cabac.only_count = 1; kvz_crypto_delete(&state->crypto_hdl); } @@ -1218,8 +1216,6 @@ static void encoder_state_init_children(encoder_state_t * const state) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP, state->frame->slicetype); - memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); - state->search_cabac.only_count = 1; } //Clear the jobs