From d6b2ec58147c76c9f8ee0f9e65b17ce336bde25e Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 18:47:14 +0200 Subject: [PATCH 001/135] Only check used reference picture lists when validating merge candidates. Merge candidate struct should be initialized to zero, so this should not have any effect. The conditions are added in case someone decides to copy the code as an example. --- src/search_inter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index f246e48b..216bbb49 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1709,8 +1709,10 @@ static void search_pu_inter(encoder_state_t * const state, // Don't try merge candidates that don't satisfy mv constraints. // Don't add duplicates to list - if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_cu->inter.mv_dir & 1; + bool active_L1 = cur_cu->inter.mv_dir & 2; + if (active_L0 && !fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || is_duplicate) { continue; From f1f0033bf57a2a837177d3fd2a5136be94183039 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 2 Dec 2021 10:42:30 +0200 Subject: [PATCH 002/135] Add a cli option to control whether intra cus are tried to combine on the lower depth when search for said depth is disabled --- src/cfg.c | 5 +++++ src/cli.c | 8 ++++++++ src/kvazaar.h | 3 +++ src/search.c | 3 ++- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/cfg.c b/src/cfg.c index 07c71a55..c8a3dfa4 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -183,6 +183,8 @@ int kvz_config_init(kvz_config *cfg) cfg->fastrd_sampling_on = 0; cfg->fastrd_accuracy_check_on = 0; cfg->fastrd_learning_outdir_fn = NULL; + + cfg->combine_intra_cus = 1; return 1; } @@ -1421,6 
+1423,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) else if OPT("stats-file-prefix") { cfg->stats_file_prefix = strdup(value); } + else if OPT("combine-intra-cus") { + cfg->combine_intra_cus = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 811537b3..2212aa9b 100644 --- a/src/cli.c +++ b/src/cli.c @@ -167,6 +167,8 @@ static const struct option long_options[] = { { "fastrd-sampling", no_argument, NULL, 0 }, { "fastrd-accuracy-check", no_argument, NULL, 0 }, { "fastrd-outdir", required_argument, NULL, 0 }, + { "combine-intra-cus", no_argument, NULL, 0 }, + { "no-combine-intra-cus", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -578,6 +580,12 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n" + " on lower depth even when search is not\n" + " performed on said depth. Should only\n" + " be disabled if cus absolutely must not\n" + " be larger than limited by the search.\n" + " [enabled]" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/kvazaar.h b/src/kvazaar.h index f03ffa27..0e6779b4 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -479,6 +479,9 @@ typedef struct kvz_config char *fastrd_learning_outdir_fn; + /** \brief whether to try combining intra cus at the lower depth when search + * is not performed at said depth*/ + uint8_t combine_intra_cus; } kvz_config; /** diff --git a/src/search.c b/src/search.c index 909e7aa5..d2de84cb 100644 --- a/src/search.c +++ b/src/search.c @@ -754,7 +754,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // gets used, at least in the most obvious cases, while avoiding any // searching. 
if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH - && x + cu_width <= frame->width && y + cu_width <= frame->height) + && x + cu_width <= frame->width && y + cu_width <= frame->height + && state->encoder_control->cfg.combine_intra_cus) { cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); From ec2f4e0bac18f9c5b077713168fb91495ab5e17a Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 19:32:45 +0200 Subject: [PATCH 003/135] Use double for RD costs in most places --- src/rdo.c | 20 ++++++++--------- src/search.c | 4 ++-- src/search_inter.c | 53 +++++++++++++++++++++++----------------------- src/search_inter.h | 2 +- src/search_intra.c | 5 ++--- src/transform.c | 8 +++---- 6 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index ec713603..5403fa61 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1029,15 +1029,15 @@ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, * \returns int * Calculates Motion Vector cost and related costs using CABAC coding */ -uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, - int x, - int y, - int mv_shift, - int16_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, + int x, + int y, + int mv_shift, + int16_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + uint32_t *bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; @@ -1174,7 +1174,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); // Store bitcost before restoring cabac - return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5); + return *bitcost * state->lambda_sqrt; } void kvz_close_rdcost_outfiles(void) diff --git a/src/search.c b/src/search.c index 909e7aa5..4345ad75 
100644 --- a/src/search.c +++ b/src/search.c @@ -462,8 +462,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; - double cost = MAX_INT; - double inter_zero_coeff_cost = MAX_INT; + double cost = MAX_DOUBLE; + double inter_zero_coeff_cost = MAX_DOUBLE; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; diff --git a/src/search_inter.c b/src/search_inter.c index 216bbb49..1b705e4f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -85,7 +85,7 @@ typedef struct { /** * \brief Cost of best_mv */ - uint32_t best_cost; + double best_cost; /** * \brief Bit cost of best_mv */ @@ -390,15 +390,15 @@ static int select_mv_cand(const encoder_state_t *state, } -static uint32_t calc_mvd_cost(const encoder_state_t *state, - int x, - int y, - int mv_shift, - int16_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +static double calc_mvd_cost(const encoder_state_t *state, + int x, + int y, + int mv_shift, + int16_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + uint32_t *bitcost) { uint32_t temp_bitcost = 0; uint32_t merge_idx; @@ -428,7 +428,7 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state, temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; - return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); + return temp_bitcost * state->lambda_sqrt; } @@ -624,7 +624,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
@@ -732,7 +732,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). @@ -832,7 +832,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, 0} }; - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). @@ -997,11 +997,12 @@ static void search_frac(inter_search_info_t *info) // Set mv to pixel precision vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - unsigned best_cost = UINT32_MAX; + double best_cost = MAX_DOUBLE; uint32_t best_bitcost = 0; uint32_t bitcosts[4] = { 0 }; unsigned best_index = 0; +// Keep this as unsigned until SAD / SATD functions are updated unsigned costs[4] = { 0 }; ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE]; @@ -1338,7 +1339,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = UINT32_MAX; + info->best_cost = MAX_DOUBLE; switch (cfg->ime_algorithm) { case KVZ_IME_TZ: @@ -1365,7 +1366,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { search_frac(info); - } else if (info->best_cost < UINT32_MAX) { + } else if (info->best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. 
info->best_cost = kvz_image_calc_satd( info->state->tile->frame->source, @@ -1376,7 +1377,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), info->width, info->height); - info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5); + info->best_cost += info->best_bitcost * info->state->lambda_sqrt; } mv = info->best_mv; @@ -1504,7 +1505,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; - uint32_t cost = + double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); uint32_t bitcost[2] = { 0, 0 }; @@ -1529,7 +1530,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, merge_cand[j].ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info->state->lambda_sqrt * extra_bits + 0.5; + cost += info->state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; @@ -1630,7 +1631,7 @@ static void search_pu_inter(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { - *inter_cost = MAX_INT; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; const kvz_config *cfg = &state->encoder_control->cfg; @@ -1826,7 +1827,7 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - uint32_t cost = + double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); uint32_t bitcost[2] = { 0, 0 }; @@ -1851,7 +1852,7 @@ static void search_pu_inter(encoder_state_t * const state, unipreds[1].inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits 
+ 0.5; + cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; @@ -2056,14 +2057,14 @@ void kvz_search_cu_smp(encoder_state_t * const state, cur_pu->depth = depth; cur_pu->qp = state->qp; - double cost = MAX_INT; + double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); - if (cost >= MAX_INT) { + if (cost == MAX_DOUBLE) { // Could not find any motion vector. - *inter_cost = MAX_INT; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; return; } diff --git a/src/search_inter.h b/src/search_inter.h index 0d7fb81b..8b4b16f2 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -64,7 +64,7 @@ enum hpel_position { HPEL_POS_DIA = 2 }; -typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state, +typedef double kvz_mvd_cost_func(const encoder_state_t *state, int x, int y, int mv_shift, int16_t mv_cand[2][2], diff --git a/src/search_intra.c b/src/search_intra.c index 9cf984db..6d3aa141 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -524,9 +524,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. - int lambda_cost = (int)(state->lambda_sqrt + 0.5); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds); + costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds); } #undef PARALLEL_BLKS @@ -595,7 +594,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) { int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds); - costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5); + costs[rdo_mode] = rdo_bitcost * state->lambda; // Perform transform split search and save mode RD cost for the best one. 
cu_info_t pred_cu; diff --git a/src/transform.c b/src/transform.c index f8e6325f..7a339e27 100644 --- a/src/transform.c +++ b/src/transform.c @@ -250,25 +250,23 @@ int kvz_quantize_residual_trskip( struct { kvz_pixel rec[4*4]; coeff_t coeff[4*4]; - uint32_t cost; + double cost; int has_coeffs; } skip, noskip, *best; - - const int bit_cost = (int)(state->lambda + 0.5); noskip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 0, in_stride, 4, ref_in, pred_in, noskip.rec, noskip.coeff, false); noskip.cost = kvz_pixels_calc_ssd(ref_in, noskip.rec, in_stride, 4, 4); - noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; + noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * state->lambda; skip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 1, in_stride, 4, ref_in, pred_in, skip.rec, skip.coeff, false); skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, 4, 4); - skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * bit_cost; + skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * state->lambda; if (noskip.cost <= skip.cost) { *trskip_out = 0; From e000c7229fb1b5b3f210c1bf7a5c25da40cc24b3 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 18:54:08 +0200 Subject: [PATCH 004/135] Fix bit costs in search_pu_inter_ref a bit --- src/search_inter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 1b705e4f..3eb0f840 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1398,16 +1398,17 @@ static void search_pu_inter_ref(inter_search_info_t *info, } // Only check when candidates are different + uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = 0; if (!merged) { cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; } if (info->best_cost < *inter_cost) { // 
Map reference index to L0/L1 pictures cur_cu->inter.mv_dir = ref_list+1; - uint8_t mv_ref_coded = LX_idx; cur_cu->merged = merged; cur_cu->merge_idx = merge_idx; @@ -1418,7 +1419,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; + *inter_bitcost = info->best_bitcost; } From 3265d45a4e5d8b69c5fa4a6617810b881130d8e9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 02:02:52 +0200 Subject: [PATCH 005/135] Temporarily remove FME threshold for verification purposes --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 3eb0f840..b2f4a765 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1363,7 +1363,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, break; } - if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { + if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { search_frac(info); } else if (info->best_cost < MAX_DOUBLE) { From 936fb766852e669e88946802f64d58d46ee27fa9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 26 Nov 2021 23:47:10 +0200 Subject: [PATCH 006/135] Remove merge candidate stuff from search_pu_inter_ref There is a separate merge analysis now --- src/search_inter.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index b2f4a765..f091c260 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1382,36 +1382,17 @@ static void search_pu_inter_ref(inter_search_info_t *info, mv = info->best_mv; - int merged = 0; - int merge_idx = 0; - // Check every candidate to find a match - for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (info->merge_cand[merge_idx].dir != 3 && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 
1][0] == mv.x && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y && - (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][ - info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx) - { - merged = 1; - break; - } - } - // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = 0; - if (!merged) { - cu_mv_cand = - select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; - } + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; if (info->best_cost < *inter_cost) { // Map reference index to L0/L1 pictures cur_cu->inter.mv_dir = ref_list+1; - cur_cu->merged = merged; - cur_cu->merge_idx = merge_idx; + cur_cu->merged = false; + cur_cu->skipped = false; cur_cu->inter.mv_ref[ref_list] = LX_idx; cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; @@ -1428,6 +1409,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); if (valid_mv) { // Map reference index to L0/L1 pictures + unipred_LX[ref_list].merged = false; + unipred_LX[ref_list].skipped = false; unipred_LX[ref_list].inter.mv_dir = ref_list + 1; unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; From 90c0a708a799ae01896d0c0943d0a8a104cd98b7 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 28 Nov 2021 23:40:16 +0200 Subject: [PATCH 007/135] Add new structs for storing statistics during the search. Use in AMVP search. 
--- src/search.c | 20 ++++++++++ src/search.h | 15 ++++++++ src/search_inter.c | 91 +++++++++++++++++++++++++++++++--------------- 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/src/search.c b/src/search.c index 4345ad75..385c4981 100644 --- a/src/search.c +++ b/src/search.c @@ -415,6 +415,7 @@ static double calc_mode_bits(const encoder_state_t *state, } +// TODO: replace usages of this by the kvz_sort_indices_by_cost function. /** * \brief Sort modes and costs to ascending order according to costs. */ @@ -439,6 +440,25 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t } +/** + * \brief Sort indices to ascending order according to costs. + */ +void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map) +{ + // Size of sorted arrays is expected to be "small". No need for faster algorithm. + for (uint8_t i = 1; i < map->size; ++i) { + const int8_t cur_idx = map->idx[i]; + const double cur_cost = map->stats[cur_idx].cost; + uint8_t j = i; + while (j > 0 && cur_cost < map->stats[map->idx[j - 1]].cost) { + map->idx[j] = map->idx[j - 1]; + --j; + } + map->idx[j] = cur_idx; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; diff --git a/src/search.h b/src/search.h index 774a4d7b..fe6d7f5d 100644 --- a/src/search.h +++ b/src/search.h @@ -44,7 +44,22 @@ #include "image.h" #include "constraint.h" +typedef struct blk_stats_t { + + cu_info_t blk; // list of blocks + double cost; // list of RD costs + uint32_t bits; // list of bit costs +} blk_stats_t; + +typedef struct blk_stats_map_t { + + blk_stats_t *stats; // list of block statistics entries + int8_t *idx; // list of indices to block stats (to be sorted by costs) + int size; // number of active elements in the lists +} blk_stats_map_t; + void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); +void kvz_sort_indices_by_cost(blk_stats_map_t 
*__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index f091c260..d561387a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1215,11 +1215,12 @@ static void apply_mv_scaling(int32_t current_poc, */ static void search_pu_inter_ref(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, + lcu_t *lcu, + cu_info_t *cur_cu, double *inter_cost, uint32_t *inter_bitcost, double *best_LX_cost, - cu_info_t *unipred_LX) + blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1409,15 +1410,23 @@ static void search_pu_inter_ref(inter_search_info_t *info, bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); if (valid_mv) { // Map reference index to L0/L1 pictures - unipred_LX[ref_list].merged = false; - unipred_LX[ref_list].skipped = false; - unipred_LX[ref_list].inter.mv_dir = ref_list + 1; - unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; - unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; - unipred_LX[ref_list].inter.mv[ref_list][1] = (int16_t)mv.y; + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped = false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; - CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; + + // TODO: remove (this is just to keep old functionality) best_LX_cost[ref_list] = info->best_cost; } } @@ -1669,6 +1678,7 @@ static void search_pu_inter(encoder_state_t * const state, mrg_costs[i] = MAX_DOUBLE; } + cu_info_t orig_cu = *cur_cu; int num_rdo_cands = 0; // Check motion vector constraints 
and perform rough search @@ -1765,16 +1775,31 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - cu_info_t unipreds[2]; + double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; // TODO: remove + blk_stats_t stats[2][MAX_REF_PIC_COUNT]; + int8_t idx[2][MAX_REF_PIC_COUNT]; + blk_stats_map_t amvp[2]; + + for (int ref_list = 0; ref_list < 2; ++ref_list) { + amvp[ref_list].stats = stats[ref_list]; + amvp[ref_list].idx = idx [ref_list]; + amvp[ref_list].size = 0; + for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { + amvp[ref_list].stats[i].blk = orig_cu; + amvp[ref_list].idx[i] = i; + } + } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds); + search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, amvp); } + kvz_sort_indices_by_cost(&amvp[0]); + kvz_sort_indices_by_cost(&amvp[1]); + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1792,15 +1817,21 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; + int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + cu_info_t *best_unipred[2] = { + &amvp[0].stats[best_idx[0]].blk, + &amvp[1].stats[best_idx[1]].blk + }; + int16_t mv[2][2]; - mv[0][0] = unipreds[0].inter.mv[0][0]; - mv[0][1] = unipreds[0].inter.mv[0][1]; - mv[1][0] = unipreds[1].inter.mv[1][0]; - mv[1][1] = unipreds[1].inter.mv[1][1]; + mv[0][0] = best_unipred[0]->inter.mv[0][0]; + mv[0][1] = best_unipred[0]->inter.mv[0][1]; + mv[1][0] = best_unipred[1]->inter.mv[1][0]; + mv[1][1] = best_unipred[1]->inter.mv[1][1]; 
kvz_inter_recon_bipred(info.state, - ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]], - ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]], + ref->images[ref_LX[0][best_unipred[0]->inter.mv_ref[0]]], + ref->images[ref_LX[1][best_unipred[1]->inter.mv_ref[1]]], x, y, width, height, @@ -1817,23 +1848,23 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; cost += info.mvd_cost_func(info.state, - unipreds[0].inter.mv[0][0], - unipreds[0].inter.mv[0][1], + best_unipred[0]->inter.mv[0][0], + best_unipred[0]->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); cost += info.mvd_cost_func(info.state, - unipreds[1].inter.mv[1][0], - unipreds[1].inter.mv[1][1], + best_unipred[1]->inter.mv[1][0], + best_unipred[1]->inter.mv[1][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - unipreds[0].inter.mv_ref[0], - unipreds[1].inter.mv_ref[1] + best_unipred[0]->inter.mv_ref[0], + best_unipred[1]->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info.state->lambda_sqrt * extra_bits; @@ -1841,13 +1872,13 @@ static void search_pu_inter(encoder_state_t * const state, if (cost < *inter_cost) { cur_cu->inter.mv_dir = 3; - cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1]; + cur_cu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + cur_cu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0]; - cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1]; - cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0]; - cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1]; + cur_cu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; + cur_cu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; + cur_cu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; + cur_cu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; cur_cu->merged = 0; // Check every candidate 
to find a match From 2ed434e57bcd7bbe351d1d335db101c32420850a Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 02:16:28 +0200 Subject: [PATCH 008/135] Remove now deprecated array --- src/search_inter.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index d561387a..514ca1a9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1219,7 +1219,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, cu_info_t *cur_cu, double *inter_cost, uint32_t *inter_bitcost, - double *best_LX_cost, blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1406,29 +1405,24 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Update best unipreds for biprediction - if (info->best_cost < best_LX_cost[ref_list]) { - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + // Map reference index to L0/L1 pictures + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped = false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - entry->cost = info->best_cost; - entry->bits = info->best_bitcost; - cur_map->size++; - - // TODO: remove (this 
is just to keep old functionality) - best_LX_cost[ref_list] = info->best_cost; - } + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; } } @@ -1775,7 +1769,6 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; // TODO: remove blk_stats_t stats[2][MAX_REF_PIC_COUNT]; int8_t idx[2][MAX_REF_PIC_COUNT]; blk_stats_map_t amvp[2]; @@ -1794,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, amvp); } kvz_sort_indices_by_cost(&amvp[0]); @@ -1808,7 +1801,7 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { // Try biprediction from valid acquired unipreds. - if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) { + if (amvp[0].size > 0 && amvp[1].size > 0) { // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. 
From 1940f0880f1440ac15440adbb63c560b1baab4a7 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 29 Nov 2021 16:57:40 +0200 Subject: [PATCH 009/135] Add amvp unipredictions to both lists if reference picture is present --- src/search_inter.c | 350 ++++++++++++++++++++++----------------------- 1 file changed, 174 insertions(+), 176 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 514ca1a9..4257ff09 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1223,206 +1223,204 @@ static void search_pu_inter_ref(inter_search_info_t *info, { const kvz_config *cfg = &info->state->encoder_control->cfg; - // which list, L0 or L1, ref_idx is in and in what index - int8_t ref_list = -1; - // the index of the ref_idx in L0 or L1 list - int8_t LX_idx; - // max value of LX_idx plus one - const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + // Reference picture might be in both lists + bool ref_list_active[2] = { false, false }; + // Reference picture indices in L0 and L1 lists + int8_t ref_list_idx[2] = { -1, -1 }; - for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) - { - // check if ref_idx is in L0 - if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { - ref_list = 0; - break; - } - - // check if ref_idx is in L1 - if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { - ref_list = 1; - break; - } - } - // ref_idx has to be found in either L0 or L1 - assert(LX_idx < LX_IDX_MAX_PLUS_1); - - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - - // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = LX_idx; - - kvz_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); - - // store old values back - cur_cu->inter.mv_ref[ref_list] = 
temp_ref_idx; - - vector2d_t mv = { 0, 0 }; - - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - vector2d_t mv_previous = { 0, 0 }; - if (ref_cu->inter.mv_dir & 1) { - mv_previous.x = ref_cu->inter.mv[0][0]; - mv_previous.y = ref_cu->inter.mv[0][1]; - } - else { - mv_previous.x = ref_cu->inter.mv[1][0]; - mv_previous.y = ref_cu->inter.mv[1][1]; - } - // Apply mv scaling if neighbor poc is available - if (info->state->frame->ref_LX_size[ref_list] > 0) { - // When there are reference pictures from the future (POC > current POC) - // in L0 or L1, the primary list for the colocated PU is the inverse of - // collocated_from_l0_flag. Otherwise it is equal to reflist. - // - // Kvazaar always sets collocated_from_l0_flag so the list is L1 when - // there are future references. - int col_list = ref_list; - for (int i = 0; i < info->state->frame->ref->used_size; i++) { - if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { - col_list = 1; - break; - } + // Check if ref picture is present in the lists + for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) { + if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) { + ref_list_active[ref_list] = true; + ref_list_idx[ref_list] = i; + break; } - if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { - // Use the other list if the colocated PU does not have a MV for the - // primary list. 
- col_list = 1 - col_list; - } - - uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; - // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument - apply_mv_scaling( - info->state->frame->poc, - info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], - info->state->frame->ref->pocs[neighbor_poc_index], - info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ - info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] - [ref_cu->inter.mv_ref[col_list]] - ], - &mv_previous - ); - } - - // Check if the mv is valid after scaling - if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; } } - int search_range = 32; - switch (cfg->ime_algorithm) { - case KVZ_IME_FULL64: search_range = 64; break; - case KVZ_IME_FULL32: search_range = 32; break; - case KVZ_IME_FULL16: search_range = 16; break; - case KVZ_IME_FULL8: search_range = 8; break; - default: break; - } + // Must find at least one reference picture + assert(ref_list_active[0] || ref_list_active[1]); - info->best_cost = MAX_DOUBLE; + // TODO: remove + double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, mv); - break; + for (int ref_list = 1; ref_list >= 0; --ref_list) { + if (ref_list_active[ref_list]) { - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, mv); - break; + int LX_idx = ref_list_idx[ref_list]; - case KVZ_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; + // store temp values to be stored back later + int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; - } + // Get MV candidates + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; - if (cfg->fme_level > 0 && 
info->best_cost < MAX_DOUBLE) { - search_frac(info); - - } else if (info->best_cost < MAX_DOUBLE) { - // Recalculate inter cost with SATD. - info->best_cost = kvz_image_calc_satd( - info->state->tile->frame->source, - info->ref, + kvz_inter_get_mv_cand(info->state, info->origin.x, info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), info->width, - info->height); - info->best_cost += info->best_bitcost * info->state->lambda_sqrt; - } + info->height, + info->mv_cand, + cur_cu, + lcu, + ref_list); - mv = info->best_mv; + // store old values back + cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - // Only check when candidates are different - uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + vector2d_t mv = { 0, 0 }; - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list+1; + // Take starting point for MV search from previous frame. + // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. 
+ const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + vector2d_t mv_previous = { 0, 0 }; + if (ref_cu->inter.mv_dir & 1) { + mv_previous.x = ref_cu->inter.mv[0][0]; + mv_previous.y = ref_cu->inter.mv[0][1]; + } else { + mv_previous.x = ref_cu->inter.mv[1][0]; + mv_previous.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; + } + } + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. 
+ col_list = 1 - col_list; + } - cur_cu->merged = false; - cur_cu->skipped = false; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv_previous + ); + } - CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); + // Check if the mv is valid after scaling + if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { + mv = mv_previous; + } + } - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost; - } + int search_range = 32; + switch (cfg->ime_algorithm) { + case KVZ_IME_FULL64: search_range = 64; break; + case KVZ_IME_FULL32: search_range = 32; break; + case KVZ_IME_FULL16: search_range = 16; break; + case KVZ_IME_FULL8: search_range = 8; break; + default: break; + } + info->best_cost = MAX_DOUBLE; - // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, mv); + break; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + case KVZ_IME_FULL64: + case 
KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, mv); + break; - entry->cost = info->best_cost; - entry->bits = info->best_bitcost; - cur_map->size++; + case KVZ_IME_DIA: + diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + + default: + hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + } + + if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { + search_frac(info); + + } else if (info->best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. + info->best_cost = kvz_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), + info->width, + info->height); + info->best_cost += info->best_bitcost * info->state->lambda_sqrt; + } + + mv = info->best_mv; + + // Only check when candidates are different + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + if (info->best_cost < *inter_cost) { + // Map reference index to L0/L1 pictures + cur_cu->inter.mv_dir = ref_list + 1; + + cur_cu->merged = false; + cur_cu->skipped = false; + cur_cu->inter.mv_ref[ref_list] = LX_idx; + cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; + cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; + CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); + + *inter_cost = info->best_cost; + *inter_bitcost = info->best_bitcost; + } + + // Map reference index to L0/L1 pictures + blk_stats_map_t *cur_map = &amvp[ref_list]; + blk_stats_t *entry = &cur_map->stats[cur_map->size]; + cu_info_t *pb = &entry->blk; + pb->merged = false; + pb->skipped 
= false; + pb->inter.mv_dir = ref_list + 1; + pb->inter.mv_ref[ref_list] = LX_idx; + pb->inter.mv[ref_list][0] = (int16_t)mv.x; + pb->inter.mv[ref_list][1] = (int16_t)mv.y; + CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + + entry->cost = info->best_cost; + entry->bits = info->best_bitcost; + cur_map->size++; + } + } } } From 48773b0d25e61f182ad843ef33b7a38f3c0197e0 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 00:19:25 +0200 Subject: [PATCH 010/135] Replace and relocate deprecated cost and mode parameter tracking. --- src/search_inter.c | 52 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 4257ff09..7cbf882d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1217,8 +1217,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, int depth, lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost, blk_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1242,10 +1240,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Must find at least one reference picture assert(ref_list_active[0] || ref_list_active[1]); - // TODO: remove - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - - for (int ref_list = 1; ref_list >= 0; --ref_list) { + for (int ref_list = 0; ref_list < 2; ++ref_list) { if (ref_list_active[ref_list]) { int LX_idx = ref_list_idx[ref_list]; @@ -1388,21 +1383,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list + 1; - - cur_cu->merged = false; - cur_cu->skipped = false; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y; - 
CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); - - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost; - } + if (valid_mv && info->best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures blk_stats_map_t *cur_map = &amvp[ref_list]; @@ -1785,12 +1766,33 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_cu, amvp); } kvz_sort_indices_by_cost(&amvp[0]); kvz_sort_indices_by_cost(&amvp[1]); + int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + double best_cost_L0 = MAX_DOUBLE; + double best_cost_L1 = MAX_DOUBLE; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].stats[best_idx[0]].cost; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].stats[best_idx[1]].cost; + int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; + int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; + + cu_info_t *best_unipred[2] = { + &amvp[0].stats[best_idx[0]].blk, + &amvp[1].stats[best_idx[1]].blk + }; + + // Set best valid unipred to cur_cu + if (best_cost < MAX_DOUBLE) { + // Map reference index to L0/L1 pictures + *cur_cu = *best_unipred[best_list]; + *inter_cost = amvp[best_list].stats[best_idx[best_list]].cost; + *inter_bitcost = amvp[best_list].stats[best_idx[best_list]].bits; + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1808,12 +1810,6 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; - int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; - cu_info_t *best_unipred[2] = { - &amvp[0].stats[best_idx[0]].blk, - &amvp[1].stats[best_idx[1]].blk - }; - int16_t mv[2][2]; mv[0][0] = best_unipred[0]->inter.mv[0][0]; mv[0][1] = best_unipred[0]->inter.mv[0][1]; From 94096dd1755618c38a024b73cc944b4650bdfce0 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 00:34:34 +0200 Subject: [PATCH 011/135] Ignore merge candidates when computing AMVP motion vector costs. 
--- src/search_inter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 7cbf882d..474e3883 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -234,8 +234,8 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->state, x, y, 2, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcost ); @@ -1068,8 +1068,8 @@ static void search_frac(inter_search_info_t *info) costs[0] += info->mvd_cost_func(state, mv.x, mv.y, 2, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[0]); best_cost = costs[0]; @@ -1128,8 +1128,8 @@ static void search_frac(inter_search_info_t *info) mv.y + pattern[j]->y, mv_shift, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[j] ); From 8406942d06fd5885d4fc1794cfe1591fc6aab036 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 30 Nov 2021 19:15:36 +0200 Subject: [PATCH 012/135] Improve the new data structure a bit. Use also for merge candidates. --- src/search.c | 12 +++--- src/search.h | 29 +++++++------ src/search_inter.c | 103 +++++++++++++++++++++------------------------ 3 files changed, 70 insertions(+), 74 deletions(-) diff --git a/src/search.c b/src/search.c index 385c4981..c0f32034 100644 --- a/src/search.c +++ b/src/search.c @@ -443,18 +443,18 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t /** * \brief Sort indices to ascending order according to costs. */ -void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map) +void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map) { // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
for (uint8_t i = 1; i < map->size; ++i) { - const int8_t cur_idx = map->idx[i]; - const double cur_cost = map->stats[cur_idx].cost; + const int8_t cur_indx = map->indx[i]; + const double cur_cost = map->cost[cur_indx]; uint8_t j = i; - while (j > 0 && cur_cost < map->stats[map->idx[j - 1]].cost) { - map->idx[j] = map->idx[j - 1]; + while (j > 0 && cur_cost < map->cost[map->indx[j - 1]]) { + map->indx[j] = map->indx[j - 1]; --j; } - map->idx[j] = cur_idx; + map->indx[j] = cur_indx; } } diff --git a/src/search.h b/src/search.h index fe6d7f5d..9617e7b9 100644 --- a/src/search.h +++ b/src/search.h @@ -44,22 +44,27 @@ #include "image.h" #include "constraint.h" -typedef struct blk_stats_t { - cu_info_t blk; // list of blocks - double cost; // list of RD costs - uint32_t bits; // list of bit costs -} blk_stats_t; + /** + * \brief Data collected during search processes. + * + * The intended use is to collect statistics of the + * searched coding/prediction units. Data related to + * a specific unit is found at index i. The arrays + * should be indexed by elements of the "indx" array + * that will be sorted by the RD costs of the units. 
+ */ +typedef struct unit_stats_map_t { -typedef struct blk_stats_map_t { - - blk_stats_t *stats; // list of block statistics entries - int8_t *idx; // list of indices to block stats (to be sorted by costs) - int size; // number of active elements in the lists -} blk_stats_map_t; + cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units + double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs + uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs + int8_t indx[MAX_REF_PIC_COUNT]; //!< list of indices to elements in the other arrays + int size; //!< number of active elements in the lists +} unit_stats_map_t; void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); -void kvz_sort_indices_by_cost(blk_stats_map_t *__restrict map); +void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index 474e3883..e8272b2b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1217,7 +1217,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, int depth, lcu_t *lcu, cu_info_t *cur_cu, - blk_stats_map_t *amvp) + unit_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1386,9 +1386,9 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (valid_mv && info->best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures - blk_stats_map_t *cur_map = &amvp[ref_list]; - blk_stats_t *entry = &cur_map->stats[cur_map->size]; - cu_info_t *pb = &entry->blk; + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *pb = &cur_map->unit[entry]; pb->merged = false; pb->skipped = false; pb->inter.mv_dir = ref_list + 1; @@ -1397,8 +1397,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, pb->inter.mv[ref_list][1] = (int16_t)mv.y; CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - 
entry->cost = info->best_cost; - entry->bits = info->best_bitcost; + cur_map->cost[entry] = info->best_cost; + cur_map->bits[entry] = info->best_bitcost; cur_map->size++; } } @@ -1643,16 +1643,14 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); - // Merge Analysis starts here - int8_t mrg_cands[MRG_MAX_NUM_CANDS]; - double mrg_costs[MRG_MAX_NUM_CANDS]; - for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - mrg_cands[i] = -1; - mrg_costs[i] = MAX_DOUBLE; - } - cu_info_t orig_cu = *cur_cu; - int num_rdo_cands = 0; + + // Merge Analysis starts here + unit_stats_map_t merge = { .size = 0 }; + for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { + merge.indx[i] = -1; + merge.cost[i] = MAX_DOUBLE; + } // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { @@ -1672,8 +1670,8 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - mrg_cands, - num_rdo_cands); + merge.indx, + merge.size); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list @@ -1687,23 +1685,29 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - mrg_costs[num_rdo_cands] = kvz_satd_any_size(width, height, + + merge.cost[merge.size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt; + merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; + merge.bits[merge.size] = merge_idx; + merge.indx[merge.size] = merge.size; - mrg_cands[num_rdo_cands] = merge_idx; - num_rdo_cands++; + merge.unit[merge.size] = *cur_cu; + merge.unit[merge.size].type = CU_INTER; + merge.unit[merge.size].merge_idx = merge_idx; + merge.unit[merge.size].merged = true; + merge.unit[merge.size].skipped = false; + + merge.size++; } - // Sort candidates by cost - kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + kvz_sort_indices_by_cost(&merge); - // Limit by availability - // TODO: Do not limit to just 1 - num_rdo_cands = MIN(1, num_rdo_cands); + // Try early skip decision on just one merge candidate if available + int num_rdo_cands = MIN(1, merge.size); // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1714,7 +1718,7 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = mrg_cands[merge_rdo_idx]; + int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1748,17 +1752,12 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - blk_stats_t stats[2][MAX_REF_PIC_COUNT]; - int8_t idx[2][MAX_REF_PIC_COUNT]; - blk_stats_map_t amvp[2]; + unit_stats_map_t amvp[2] = { { .size = 0 }, { .size = 0 } }; for (int ref_list = 0; ref_list < 2; ++ref_list) { - amvp[ref_list].stats = stats[ref_list]; - amvp[ref_list].idx = idx [ref_list]; - amvp[ref_list].size = 0; for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].stats[i].blk = orig_cu; - amvp[ref_list].idx[i] = i; + amvp[ref_list].unit[i] = orig_cu; // TODO: only initialize what is necessary + amvp[ref_list].indx[i] = i; } } @@ -1772,25 +1771,25 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_indices_by_cost(&amvp[0]); kvz_sort_indices_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].idx[0], amvp[1].idx[0] }; + int best_idx[2] = { amvp[0].indx[0], amvp[1].indx[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].stats[best_idx[0]].cost; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].stats[best_idx[1]].cost; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_idx[1]]; int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; cu_info_t *best_unipred[2] = { - &amvp[0].stats[best_idx[0]].blk, - &amvp[1].stats[best_idx[1]].blk + &amvp[0].unit[best_idx[0]], + &amvp[1].unit[best_idx[1]] }; // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures *cur_cu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].stats[best_idx[best_list]].cost; - *inter_bitcost = amvp[best_list].stats[best_idx[best_list]].bits; + *inter_cost = amvp[best_list].cost[best_idx[best_list]]; + *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; } // Search bi-pred positions @@ -1907,21 +1906,13 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - if (mrg_costs[0] < *inter_cost) { - *inter_cost = mrg_costs[0]; + int best_merge_indx = merge.indx[0]; + int best_merge_cost = merge.cost[best_merge_indx]; + + if (merge.size > 0 && best_merge_cost < *inter_cost) { + *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - int merge_idx = mrg_cands[0]; - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; - cur_cu->merged = true; - cur_cu->skipped = false; + *cur_cu = merge.unit[best_merge_indx]; } if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { From aca91920545df23b2d28e7cee39d7aa596368424 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:10:36 +0200 Subject: [PATCH 013/135] Move cu_info_t initializations to search_pu_inter. Rename cur_cu cur_pu. 
--- src/search_inter.c | 131 ++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 67 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index e8272b2b..0c079a42 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1617,7 +1617,11 @@ static void search_pu_inter(encoder_state_t * const state, const int x_local = SUB_SCU(x); const int y_local = SUB_SCU(y); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_NOTSET; + cur_pu->part_size = part_mode; + cur_pu->depth = depth; + cur_pu->qp = state->qp; inter_search_info_t info = { .state = state, @@ -1640,10 +1644,8 @@ static void search_pu_inter(encoder_state_t * const state, ); // Default to candidate 0 - CU_SET_MV_CAND(cur_cu, 0, 0); - CU_SET_MV_CAND(cur_cu, 1, 0); - - cu_info_t orig_cu = *cur_cu; + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); // Merge Analysis starts here unit_stats_map_t merge = { .size = 0 }; @@ -1656,18 +1658,18 @@ static void search_pu_inter(encoder_state_t * const state, for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; + cur_pu->inter.mv_dir = cur_cand->dir; + cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; + cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; + cur_pu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_pu->inter.mv[0][1] = cur_cand->mv[0][1]; + cur_pu->inter.mv[1][0] = cur_cand->mv[1][0]; + cur_pu->inter.mv[1][1] = cur_cand->mv[1][1]; // If bipred is not enabled, do not try candidates with mv_dir == 3. 
// Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. - if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; + if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, merge.indx, @@ -1675,10 +1677,10 @@ static void search_pu_inter(encoder_state_t * const state, // Don't try merge candidates that don't satisfy mv constraints. // Don't add duplicates to list - bool active_L0 = cur_cu->inter.mv_dir & 1; - bool active_L1 = cur_cu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_pu->inter.mv_dir & 1; + bool active_L1 = cur_pu->inter.mv_dir & 2; + if (active_L0 && !fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || is_duplicate) { continue; @@ -1695,7 +1697,7 @@ static void search_pu_inter(encoder_state_t * const state, merge.bits[merge.size] = merge_idx; merge.indx[merge.size] = merge.size; - merge.unit[merge.size] = *cur_cu; + merge.unit[merge.size] = *cur_pu; merge.unit[merge.size].type = CU_INTER; merge.unit[merge.size].merge_idx = merge_idx; merge.unit[merge.size].merged = true; @@ -1711,7 +1713,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { // Reconstruct blocks with merge candidate. 
@@ -1719,27 +1721,27 @@ static void search_pu_inter(encoder_state_t * const state, // and chroma exists. // Early terminate if merge candidate with zero CBF is found. int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true); - if (!cbf_is_set_any(cur_cu->cbf, depth)) { - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->skipped = true; + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + 
cur_pu->skipped = true; *inter_cost = 0.0; // TODO: Check this *inter_bitcost = merge_idx; // TODO: Check this return; @@ -1756,7 +1758,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int ref_list = 0; ref_list < 2; ++ref_list) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].unit[i] = orig_cu; // TODO: only initialize what is necessary + amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[ref_list].indx[i] = i; } } @@ -1765,7 +1767,7 @@ static void search_pu_inter(encoder_state_t * const state, info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, amvp); + search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); } kvz_sort_indices_by_cost(&amvp[0]); @@ -1787,7 +1789,7 @@ static void search_pu_inter(encoder_state_t * const state, // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures - *cur_cu = *best_unipred[best_list]; + *cur_pu = *best_unipred[best_list]; *inter_cost = amvp[best_list].cost[best_idx[best_list]]; *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; } @@ -1856,42 +1858,42 @@ static void search_pu_inter(encoder_state_t * const state, cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; + cur_pu->inter.mv_dir = 3; - cur_cu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; + cur_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + cur_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - cur_cu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; - cur_cu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; - cur_cu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; - cur_cu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; - cur_cu->merged = 0; + cur_pu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; + cur_pu->inter.mv[0][1] = 
best_unipred[0]->inter.mv[0][1]; + cur_pu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; + cur_pu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; + cur_pu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) + if (merge_cand[merge_idx].mv[0][0] == cur_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == cur_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == cur_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == cur_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == cur_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == cur_pu->inter.mv_ref[1]) { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; + cur_pu->merged = 1; + cur_pu->merge_idx = merge_idx; break; } } // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); + kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_pu, lcu, reflist); int cu_mv_cand = select_mv_cand( info.state, info.mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], + cur_pu->inter.mv[reflist][0], + cur_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + CU_SET_MV_CAND(cur_pu, reflist, cu_mv_cand); } *inter_cost = cost; @@ -1901,7 +1903,7 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: this probably should have a separate command line option if (cfg->rdo == 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + 
search_pu_inter_bipred(&info, depth, lcu, cur_pu, inter_cost, inter_bitcost); } } @@ -1912,11 +1914,11 @@ static void search_pu_inter(encoder_state_t * const state, if (merge.size > 0 && best_merge_cost < *inter_cost) { *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - *cur_cu = merge.unit[best_merge_indx]; + *cur_pu = merge.unit[best_merge_indx]; } - if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { - assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1])); + if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } } @@ -2043,12 +2045,6 @@ void kvz_search_cu_smp(encoder_state_t * const state, const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - - cur_pu->type = CU_INTER; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->qp = state->qp; double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; @@ -2065,6 +2061,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost += cost; *inter_bitcost += bitcost; + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); From 5edb82648a0deb6bb3add740d15c3e4f332e3c0c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:20:40 +0200 Subject: [PATCH 014/135] More intuitive logic for computing RD costs and bit costs for SMP --- src/search_inter.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 0c079a42..406c6de2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2071,15 +2071,6 @@ void kvz_search_cu_smp(encoder_state_t * const 
state, } } - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); - } - // Count bits spent for coding the partition mode. int smp_extra_bits = 1; // horizontal or vertical if (state->encoder_control->cfg.amp_enable) { @@ -2092,6 +2083,16 @@ void kvz_search_cu_smp(encoder_state_t * const state, // coding the CBF. smp_extra_bits += 6; - *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits; *inter_bitcost += smp_extra_bits; + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } else { + *inter_cost += state->lambda_sqrt * smp_extra_bits; + } } From 9905cd42d6acee45f827bd0ca414f0b65190bfe4 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 20:23:21 +0200 Subject: [PATCH 015/135] Rename "indx" to "keys". There are too many "indices" already. --- src/search.c | 12 ++++++------ src/search.h | 6 +++--- src/search_inter.c | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/search.c b/src/search.c index c0f32034..8226e6d8 100644 --- a/src/search.c +++ b/src/search.c @@ -441,20 +441,20 @@ void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t /** - * \brief Sort indices to ascending order according to costs. + * \brief Sort keys (indices) to ascending order according to costs. */ -void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map) +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map) { // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
for (uint8_t i = 1; i < map->size; ++i) { - const int8_t cur_indx = map->indx[i]; + const int8_t cur_indx = map->keys[i]; const double cur_cost = map->cost[cur_indx]; uint8_t j = i; - while (j > 0 && cur_cost < map->cost[map->indx[j - 1]]) { - map->indx[j] = map->indx[j - 1]; + while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) { + map->keys[j] = map->keys[j - 1]; --j; } - map->indx[j] = cur_indx; + map->keys[j] = cur_indx; } } diff --git a/src/search.h b/src/search.h index 9617e7b9..de34755b 100644 --- a/src/search.h +++ b/src/search.h @@ -51,7 +51,7 @@ * The intended use is to collect statistics of the * searched coding/prediction units. Data related to * a specific unit is found at index i. The arrays - * should be indexed by elements of the "indx" array + * should be indexed by elements of the "keys" array * that will be sorted by the RD costs of the units. */ typedef struct unit_stats_map_t { @@ -59,12 +59,12 @@ typedef struct unit_stats_map_t { cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs - int8_t indx[MAX_REF_PIC_COUNT]; //!< list of indices to elements in the other arrays + int8_t keys[MAX_REF_PIC_COUNT]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); -void kvz_sort_indices_by_cost(unit_stats_map_t *__restrict map); +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index 406c6de2..cab20882 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1650,7 +1650,7 @@ static void search_pu_inter(encoder_state_t * const state, // Merge Analysis starts here 
unit_stats_map_t merge = { .size = 0 }; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - merge.indx[i] = -1; + merge.keys[i] = -1; merge.cost[i] = MAX_DOUBLE; } @@ -1672,7 +1672,7 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - merge.indx, + merge.keys, merge.size); // Don't try merge candidates that don't satisfy mv constraints. @@ -1695,7 +1695,7 @@ static void search_pu_inter(encoder_state_t * const state, // Add cost of coding the merge index merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; merge.bits[merge.size] = merge_idx; - merge.indx[merge.size] = merge.size; + merge.keys[merge.size] = merge.size; merge.unit[merge.size] = *cur_pu; merge.unit[merge.size].type = CU_INTER; @@ -1706,7 +1706,7 @@ static void search_pu_inter(encoder_state_t * const state, merge.size++; } - kvz_sort_indices_by_cost(&merge); + kvz_sort_keys_by_cost(&merge); // Try early skip decision on just one merge candidate if available int num_rdo_cands = MIN(1, merge.size); @@ -1720,7 +1720,7 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge.unit[merge.indx[merge_rdo_idx]].merge_idx; + int merge_idx = merge.unit[merge.keys[merge_rdo_idx]].merge_idx; cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1759,7 +1759,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int ref_list = 0; ref_list < 2; ++ref_list) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[ref_list].indx[i] = i; + amvp[ref_list].keys[i] = i; } } @@ -1770,10 +1770,10 @@ static void search_pu_inter(encoder_state_t * const state, search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); } - kvz_sort_indices_by_cost(&amvp[0]); - kvz_sort_indices_by_cost(&amvp[1]); + kvz_sort_keys_by_cost(&amvp[0]); + kvz_sort_keys_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].indx[0], amvp[1].indx[0] }; + int best_idx[2] = { amvp[0].keys[0], amvp[1].keys[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; @@ -1908,7 +1908,7 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - int best_merge_indx = merge.indx[0]; + int best_merge_indx = merge.keys[0]; int best_merge_cost = merge.cost[best_merge_indx]; if (merge.size > 0 && best_merge_cost < *inter_cost) { From d28c2295dc59902fe309391a4062ab3a1f1c0cf1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Thu, 2 Dec 2021 22:01:16 +0200 Subject: [PATCH 016/135] The best_* fields are no longer used to track anything. Convert costs to double. 
--- src/search_inter.c | 242 +++++++++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 108 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cab20882..ba007022 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -78,19 +78,6 @@ typedef struct { kvz_mvd_cost_func *mvd_cost_func; - /** - * \brief Best motion vector among the ones tested so far - */ - vector2d_t best_mv; - /** - * \brief Cost of best_mv - */ - double best_cost; - /** - * \brief Bit cost of best_mv - */ - uint32_t best_bitcost; - /** * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks @@ -203,20 +190,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int /** * \brief Calculate cost for an integer motion vector. * - * Updates info->best_mv, info->best_cost and info->best_bitcost to the new + * Updates best_mv, best_cost and best_bitcost to the new * motion vector if it yields a lower cost than the current one. * * If the motion vector violates the MV constraints for tiles or WPP, the * cost is not set. 
* - * \return true if info->best_mv was changed, false otherwise + * \return true if best_mv was changed, false otherwise */ -static bool check_mv_cost(inter_search_info_t *info, int x, int y) +static bool check_mv_cost(inter_search_info_t *info, + int x, + int y, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; uint32_t bitcost = 0; - uint32_t cost = kvz_image_calc_sad( + double cost = kvz_image_calc_sad( info->pic, info->ref, info->origin.x, @@ -228,7 +220,7 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->optimized_sad ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; cost += info->mvd_cost_func( info->state, @@ -240,13 +232,13 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) &bitcost ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; // Set to motion vector in quarter pixel precision. - info->best_mv.x = x * 4; - info->best_mv.y = y * 4; - info->best_cost = cost; - info->best_bitcost = bitcost; + best_mv->x = x * 4; + best_mv->y = y * 4; + *best_cost = cost; + *best_bits = bitcost; return true; } @@ -297,12 +289,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * \brief Select starting point for integer motion estimation search. * * Checks the zero vector, extra_mv and merge candidates and updates - * info->best_mv to the best one. + * best_mv to the best one. */ -static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv) +static void select_starting_point(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. - check_mv_cost(info, 0, 0); + check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); // Change to integer precision. 
extra_mv.x >>= 2; @@ -310,7 +306,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv // Check mv_in if it's not one of the merge candidates. if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { - check_mv_cost(info, extra_mv.x, extra_mv.y); + check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } // Go through candidates @@ -322,7 +318,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv if (x == 0 && y == 0) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -432,14 +428,17 @@ static double calc_mvd_cost(const encoder_state_t *state, } -static bool early_terminate(inter_search_info_t *info) +static bool early_terminate(inter_search_info_t *info, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, { 0, -1 }, { -1, 0 }, { 0, 0 }, }; - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; int first_index = 0; int last_index = 3; @@ -449,9 +448,9 @@ static bool early_terminate(inter_search_info_t *info) if (info->state->encoder_control->cfg.me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE) { - threshold = info->best_cost * 0.95; + threshold = *best_cost * 0.95; } else { - threshold = info->best_cost; + threshold = *best_cost; } int best_index = 6; @@ -459,7 +458,7 @@ static bool early_terminate(inter_search_info_t *info) int x = mv.x + small_hexbs[i].x; int y = mv.y + small_hexbs[i].y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -469,7 +468,7 @@ static bool early_terminate(inter_search_info_t *info) mv.y += small_hexbs[best_index].y; // If best match is not better than threshold, we stop the search. 
- if (info->best_cost >= threshold) { + if (*best_cost >= threshold) { return true; } @@ -484,7 +483,10 @@ void kvz_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, vector2d_t mv, - int *best_dist) + int *best_dist, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { assert(pattern_type < 4); @@ -586,7 +588,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, int x = mv.x + offset.x; int y = mv.y + offset.y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -599,20 +601,27 @@ void kvz_tz_pattern_search(inter_search_info_t *info, void kvz_tz_raster_search(inter_search_info_t *info, int iSearchRange, - int iRaster) + int iRaster, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { - const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + const vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; //compute SAD values for every point in the iRaster downsampled version of the current search area for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) { for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) { - check_mv_cost(info, mv.x + x, mv.y + y); + check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv); } } } -static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) +static void tz_search(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -624,25 +633,25 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } - vector2d_t start = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; // step 2, grid search int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); // Break the loop if the last three rounds didn't produce a better MV. if (best_dist != iDist) rounds_without_improvement++; @@ -655,7 +664,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) start.y = 0; rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); if (best_dist != iDist) rounds_without_improvement++; if (rounds_without_improvement >= 3) break; @@ -665,7 +674,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //step 3, raster scan if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; - kvz_tz_raster_search(info, iSearchRange, iRaster); + kvz_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv); } //step 4 @@ -673,19 +682,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //raster refinement if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - start.x = info->best_mv.x >> 2; - start.y = info->best_mv.y >> 2; - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + start.x = 
best_mv->x >> 2; + start.y = best_mv->y >> 2; + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } //star refinement (repeat step 2 for the current starting point) while (use_star_refinement && best_dist > 0) { best_dist = 0; - start.x = info->best_mv.x >> 2; - start.y = info->best_mv.y >> 2; + start.x = best_mv->x >> 2; + start.y = best_mv->y >> 2; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } } @@ -707,7 +716,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void hexagon_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -732,27 +746,27 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. for (int i = 1; i < 7; ++i) { - if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) { + if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -781,7 +795,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Iterate through the next 3 points. for (int i = 0; i < 3; ++i) { vector2d_t offset = large_hexbs[start + i]; - if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) { + if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) { best_index = start + i; } } @@ -793,7 +807,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Do the final step of the search with a small pattern. for (int i = 1; i < 9; ++i) { - check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); + check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv); } } @@ -813,7 +827,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. 
**/ -static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void diamond_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -832,28 +851,28 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, 0} }; - info->best_cost = MAX_DOUBLE; + *best_cost = MAX_DOUBLE; // Select starting point from among merge candidates. These should // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); + select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); // Check if we should stop search if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) + early_terminate(info, best_cost, best_bits, best_mv)) { return; } // current motion vector - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // current best index enum diapos best_index = DIA_CENTER; // initial search of the points of the diamond for (int i = 0; i < 5; ++i) { - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -883,7 +902,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // this is where we came from so it's checked already if (i == from_dir) continue; - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; better_found = 1; } @@ -905,12 +924,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 static void search_mv_full(inter_search_info_t *info, int32_t search_range, - vector2d_t extra_mv) + vector2d_t extra_mv, + double *best_cost, + uint32_t 
*best_bits, + vector2d_t *best_mv) { // Search around the 0-vector. for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -922,7 +944,7 @@ static void search_mv_full(inter_search_info_t *info, if (!mv_in_merge(info, extra_mv)) { for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, extra_mv.x + x, extra_mv.y + y); + check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv); } } } @@ -969,7 +991,7 @@ static void search_mv_full(inter_search_info_t *info, } if (already_tested) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } } @@ -982,7 +1004,10 @@ static void search_mv_full(inter_search_info_t *info, * Algoritm first searches 1/2-pel positions around integer mv and after best match is found, * refines the search by searching best 1/4-pel postion around best 1/2-pel position. 
*/ -static void search_frac(inter_search_info_t *info) +static void search_frac(inter_search_info_t *info, + double *best_cost, + uint32_t *best_bits, + vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: // 5 3 6 @@ -995,10 +1020,10 @@ static void search_frac(inter_search_info_t *info) }; // Set mv to pixel precision - vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; - double best_cost = MAX_DOUBLE; - uint32_t best_bitcost = 0; + double cost = MAX_DOUBLE; + uint32_t bitcost = 0; uint32_t bitcosts[4] = { 0 }; unsigned best_index = 0; @@ -1072,8 +1097,8 @@ static void search_frac(inter_search_info_t *info) 0, info->ref_idx, &bitcosts[0]); - best_cost = costs[0]; - best_bitcost = bitcosts[0]; + cost = costs[0]; + bitcost = bitcosts[0]; //Set mv to half-pixel precision mv.x *= 2; @@ -1137,9 +1162,9 @@ static void search_frac(inter_search_info_t *info) } for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; + if (within_tile[j] && costs[j] < cost) { + cost = costs[j]; + bitcost = bitcosts[j]; best_index = i + j; } } @@ -1165,9 +1190,9 @@ static void search_frac(inter_search_info_t *info) } } - info->best_mv = mv; - info->best_cost = best_cost; - info->best_bitcost = best_bitcost; + *best_mv = mv; + *best_cost = cost; + *best_bits = bitcost; } /** @@ -1264,7 +1289,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // store old values back cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - vector2d_t mv = { 0, 0 }; + vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. 
// When temporal motion vector candidates are added, there is probably @@ -1320,7 +1345,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Check if the mv is valid after scaling if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; + best_mv = mv_previous; } } @@ -1333,11 +1358,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = MAX_DOUBLE; + double best_cost = MAX_DOUBLE; + uint32_t best_bits = MAX_INT; switch (cfg->ime_algorithm) { case KVZ_IME_TZ: - tz_search(info, mv); + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); break; case KVZ_IME_FULL64: @@ -1345,45 +1371,45 @@ static void search_pu_inter_ref(inter_search_info_t *info, case KVZ_IME_FULL16: case KVZ_IME_FULL8: case KVZ_IME_FULL: - search_mv_full(info, search_range, mv); + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); break; case KVZ_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); break; default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); break; } - if (cfg->fme_level > 0 && info->best_cost < MAX_DOUBLE) { - search_frac(info); + if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { + search_frac(info, &best_cost, &best_bits, &best_mv); - } else if (info->best_cost < MAX_DOUBLE) { + } else if (best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. 
- info->best_cost = kvz_image_calc_satd( + best_cost = kvz_image_calc_satd( info->state->tile->frame->source, info->ref, info->origin.x, info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2), + info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), info->width, info->height); - info->best_cost += info->best_bitcost * info->state->lambda_sqrt; + best_cost += best_bits * info->state->lambda_sqrt; } - mv = info->best_mv; - // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); - info->best_bitcost += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + best_bits += cur_cu->inter.mv_dir - 1 + mv_ref_coded; // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv && info->best_cost < MAX_DOUBLE) { + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures unit_stats_map_t *cur_map = &amvp[ref_list]; @@ -1393,12 +1419,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, pb->skipped = false; pb->inter.mv_dir = ref_list + 1; pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)mv.x; - pb->inter.mv[ref_list][1] = (int16_t)mv.y; + pb->inter.mv[ref_list][0] = (int16_t)best_mv.x; + pb->inter.mv[ref_list][1] = (int16_t)best_mv.y; CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); - cur_map->cost[entry] = info->best_cost; - cur_map->bits[entry] = info->best_bitcost; + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; cur_map->size++; } } From 574d6c45930e8b7a626093a918651ff1744ab7a1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti 
Date: Fri, 3 Dec 2021 22:11:49 +0200 Subject: [PATCH 017/135] Eliminate copy-paste logic from different ME algorithms. --- src/search_inter.c | 96 ++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 62 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index ba007022..cae57c61 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -633,19 +633,7 @@ static void tz_search(inter_search_info_t *info, const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } - + vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; // step 2, grid search @@ -746,19 +734,6 @@ static void hexagon_search(inter_search_info_t *info, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } - vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. @@ -850,19 +825,6 @@ static void diamond_search(inter_search_info_t *info, {0, -1}, {1, 0}, {0, 1}, {-1, 0}, {0, 0} }; - - *best_cost = MAX_DOUBLE; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv, best_cost, best_bits, best_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info, best_cost, best_bits, best_mv)) - { - return; - } // current motion vector vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; @@ -1361,34 +1323,44 @@ static void search_pu_inter_ref(inter_search_info_t *info, double best_cost = MAX_DOUBLE; uint32_t best_bits = MAX_INT; - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); - break; + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). + select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); - break; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_DIA: - diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_FULL64: + case KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, 
&best_bits, &best_mv); + break; + } + + if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { + search_frac(info, &best_cost, &best_bits, &best_mv); + + } } - if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { - search_frac(info, &best_cost, &best_bits, &best_mv); - - } else if (best_cost < MAX_DOUBLE) { + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { // Recalculate inter cost with SATD. best_cost = kvz_image_calc_satd( info->state->tile->frame->source, From 70a393a3dca67d37863e14002da8900bcdc9e58f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 5 Dec 2021 00:21:09 +0200 Subject: [PATCH 018/135] Set mv candidates before cost calculations for bipred. Use the new struct for bipred. --- src/search_inter.c | 108 +++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cae57c61..893c1ee8 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1443,11 +1443,23 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - int16_t mv[2][2]; + cur_cu->inter.mv_dir = 3; + + cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; + cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = cur_cu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; + + cur_cu->merged = false; + cur_cu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + } // Don't try merge candidates that don't satisfy mv constraints. 
if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || @@ -1497,16 +1509,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, cost += info->state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { @@ -1525,7 +1527,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); int cu_mv_cand = select_mv_cand( info->state, info->mv_cand, @@ -1752,12 +1753,12 @@ static void search_pu_inter(encoder_state_t * const state, // Store unipred information of L0 and L1 for biprediction // Best cost will be left at MAX_DOUBLE if no valid CU is found - unit_stats_map_t amvp[2] = { { .size = 0 }, { .size = 0 } }; + unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; - for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { - amvp[ref_list].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[ref_list].keys[i] = i; + amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary + amvp[mv_dir - 1].keys[i] = i; } } @@ -1799,6 +1800,8 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { + cu_info_t *bipred_pu = &amvp[2].unit[0]; + // Try biprediction from valid acquired unipreds. 
if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1809,15 +1812,27 @@ static void search_pu_inter(encoder_state_t * const state, inter_merge_cand_t *merge_cand = info.merge_cand; - int16_t mv[2][2]; + bipred_pu->inter.mv_dir = 3; + + bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; + + int16_t (*mv)[2] = bipred_pu->inter.mv; mv[0][0] = best_unipred[0]->inter.mv[0][0]; mv[0][1] = best_unipred[0]->inter.mv[0][1]; mv[1][0] = best_unipred[1]->inter.mv[1][0]; mv[1][1] = best_unipred[1]->inter.mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, bipred_pu, lcu, reflist); + } kvz_inter_recon_bipred(info.state, - ref->images[ref_LX[0][best_unipred[0]->inter.mv_ref[0]]], - ref->images[ref_LX[1][best_unipred[1]->inter.mv_ref[1]]], + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, width, height, @@ -1834,74 +1849,71 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; cost += info.mvd_cost_func(info.state, - best_unipred[0]->inter.mv[0][0], - best_unipred[0]->inter.mv[0][1], + bipred_pu->inter.mv[0][0], + bipred_pu->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); cost += info.mvd_cost_func(info.state, - best_unipred[1]->inter.mv[1][0], - best_unipred[1]->inter.mv[1][1], + bipred_pu->inter.mv[1][0], + bipred_pu->inter.mv[1][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - best_unipred[0]->inter.mv_ref[0], - best_unipred[1]->inter.mv_ref[1] + bipred_pu->inter.mv_ref[0], + bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info.state->lambda_sqrt * extra_bits; if (cost < *inter_cost) { - cur_pu->inter.mv_dir = 3; - - cur_pu->inter.mv_ref[0] = 
best_unipred[0]->inter.mv_ref[0]; - cur_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - - cur_pu->inter.mv[0][0] = best_unipred[0]->inter.mv[0][0]; - cur_pu->inter.mv[0][1] = best_unipred[0]->inter.mv[0][1]; - cur_pu->inter.mv[1][0] = best_unipred[1]->inter.mv[1][0]; - cur_pu->inter.mv[1][1] = best_unipred[1]->inter.mv[1][1]; - cur_pu->merged = 0; // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_pu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_pu->inter.mv_ref[1]) + if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) { - cur_pu->merged = 1; - cur_pu->merge_idx = merge_idx; + bipred_pu->merged = 1; + bipred_pu->merge_idx = merge_idx; break; } } // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_pu, lcu, reflist); int cu_mv_cand = select_mv_cand( info.state, info.mv_cand, - cur_pu->inter.mv[reflist][0], - cur_pu->inter.mv[reflist][1], + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_pu, reflist, cu_mv_cand); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } *inter_cost = cost; *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + + *cur_pu = *bipred_pu; } } // TODO: this probably should have a separate 
command line option if (cfg->rdo == 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_pu, inter_cost, inter_bitcost); + cu_info_t bipred_pu = *cur_pu; + double prior_cost = *inter_cost; + search_pu_inter_bipred(&info, depth, lcu, &bipred_pu, inter_cost, inter_bitcost); + + if (*inter_cost < prior_cost) { + *cur_pu = bipred_pu; + } } } From adb31ce959a8eea88417ed5d43509a67ca9d0d70 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 5 Dec 2021 16:13:01 +0200 Subject: [PATCH 019/135] Use the new struct for bipred refinement as well --- src/search_inter.c | 111 +++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 893c1ee8..7727488a 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1409,9 +1409,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, */ static void search_pu_inter_bipred(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; @@ -1443,22 +1442,24 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - cur_cu->inter.mv_dir = 3; + cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + bipred_pu->inter.mv_dir = 3; - int16_t(*mv)[2] = cur_cu->inter.mv; + bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0]; + bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = bipred_pu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = false; - cur_cu->skipped = false; + bipred_pu->merged = false; + bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; 
reflist++) { - kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); } // Don't try merge candidates that don't satisfy mv constraints. @@ -1508,37 +1509,35 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } + // Check every candidate to find a match + for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) + { + bipred_pu->merged = true; + bipred_pu->merge_idx = merge_idx; + break; } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - int cu_mv_cand = select_mv_cand( - info->state, - info->mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], - NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = 
bitcost[0] + bitcost[1] + extra_bits; } + + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); + } + + amvp_bipred->cost[amvp_bipred->size] = cost; + amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->size++; } } @@ -1801,6 +1800,8 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { cu_info_t *bipred_pu = &amvp[2].unit[0]; + double best_bipred_cost = MAX_DOUBLE; + uint32_t best_bipred_bits = MAX_INT; // Try biprediction from valid acquired unipreds. if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1843,19 +1844,19 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - double cost = + best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); uint32_t bitcost[2] = { 0, 0 }; - cost += info.mvd_cost_func(info.state, + best_bipred_cost += info.mvd_cost_func(info.state, bipred_pu->inter.mv[0][0], bipred_pu->inter.mv[0][1], 0, info.mv_cand, NULL, 0, 0, &bitcost[0]); - cost += info.mvd_cost_func(info.state, + best_bipred_cost += info.mvd_cost_func(info.state, bipred_pu->inter.mv[1][0], bipred_pu->inter.mv[1][1], 0, @@ -1868,9 +1869,9 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits; + best_bipred_cost += info.state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { + if (best_bipred_cost < *inter_cost) { // Check every candidate to find a match for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { @@ 
-1898,22 +1899,22 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; - - *cur_pu = *bipred_pu; + amvp[2].cost[amvp[2].size] = best_bipred_cost; + amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].size++; } } // TODO: this probably should have a separate command line option - if (cfg->rdo == 3) { - cu_info_t bipred_pu = *cur_pu; - double prior_cost = *inter_cost; - search_pu_inter_bipred(&info, depth, lcu, &bipred_pu, inter_cost, inter_bitcost); + if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); + + kvz_sort_keys_by_cost(&amvp[2]); + int best_bipred_key = amvp[2].keys[0]; - if (*inter_cost < prior_cost) { - *cur_pu = bipred_pu; - } + if (amvp[2].size > 0 && amvp[2].cost[best_bipred_key] < *inter_cost) { + *inter_cost = amvp[2].cost[best_bipred_key]; + *inter_bitcost = amvp[2].bits[best_bipred_key]; + *cur_pu = amvp[2].unit[best_bipred_key]; } } From dc4676eef1b51521caafcd269d01664a6246b135 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 15:35:13 +0200 Subject: [PATCH 020/135] Remove merge attempts from bipred functions --- src/search_inter.c | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 7727488a..d6751d38 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1509,21 +1509,6 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; cost += info->state->lambda_sqrt * extra_bits; - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && 
- merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) - { - bipred_pu->merged = true; - bipred_pu->merge_idx = merge_idx; - break; - } - } - // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( @@ -1801,7 +1786,6 @@ static void search_pu_inter(encoder_state_t * const state, cu_info_t *bipred_pu = &amvp[2].unit[0]; double best_bipred_cost = MAX_DOUBLE; - uint32_t best_bipred_bits = MAX_INT; // Try biprediction from valid acquired unipreds. if (amvp[0].size > 0 && amvp[1].size > 0) { @@ -1811,8 +1795,6 @@ static void search_pu_inter(encoder_state_t * const state, const image_list_t *const ref = info.state->frame->ref; uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; - inter_merge_cand_t *merge_cand = info.merge_cand; - bipred_pu->inter.mv_dir = 3; bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; @@ -1873,21 +1855,6 @@ static void search_pu_inter(encoder_state_t * const state, if (best_bipred_cost < *inter_cost) { - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == bipred_pu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == bipred_pu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == bipred_pu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == bipred_pu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == bipred_pu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == bipred_pu->inter.mv_ref[1]) - { - bipred_pu->merged = 1; - bipred_pu->merge_idx = merge_idx; - break; - } - } - // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( From 3a219146edea9a2de1ddc040765708bc5033585e Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 15:47:14 +0200 
Subject: [PATCH 021/135] Rename some variables --- src/search_inter.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index d6751d38..df221bd4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1386,14 +1386,14 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Map reference index to L0/L1 pictures unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; - cu_info_t *pb = &cur_map->unit[entry]; - pb->merged = false; - pb->skipped = false; - pb->inter.mv_dir = ref_list + 1; - pb->inter.mv_ref[ref_list] = LX_idx; - pb->inter.mv[ref_list][0] = (int16_t)best_mv.x; - pb->inter.mv[ref_list][1] = (int16_t)best_mv.y; - CU_SET_MV_CAND(pb, ref_list, cu_mv_cand); + cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); cur_map->cost[entry] = best_cost; cur_map->bits[entry] = best_bits; @@ -1697,13 +1697,13 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { - for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { + for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge.unit[merge.keys[merge_rdo_idx]].merge_idx; + int merge_idx = merge.unit[merge.keys[merge_key]].merge_idx; cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; @@ -1756,25 +1756,25 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); - int best_idx[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_idx[0]]; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_idx[1]]; + if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; + if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_keys[1]]; int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? 
best_cost_L0 : best_cost_L1; cu_info_t *best_unipred[2] = { - &amvp[0].unit[best_idx[0]], - &amvp[1].unit[best_idx[1]] + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] }; // Set best valid unipred to cur_cu if (best_cost < MAX_DOUBLE) { // Map reference index to L0/L1 pictures *cur_pu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].cost[best_idx[best_list]]; - *inter_bitcost = amvp[best_list].bits[best_idx[best_list]]; + *inter_cost = amvp[best_list].cost[best_keys[best_list]]; + *inter_bitcost = amvp[best_list].bits[best_keys[best_list]]; } // Search bi-pred positions @@ -1886,13 +1886,13 @@ static void search_pu_inter(encoder_state_t * const state, } // Compare best merge cost to amvp cost - int best_merge_indx = merge.keys[0]; - int best_merge_cost = merge.cost[best_merge_indx]; + int best_merge_key = merge.keys[0]; + int best_merge_cost = merge.cost[best_merge_key]; if (merge.size > 0 && best_merge_cost < *inter_cost) { *inter_cost = best_merge_cost; *inter_bitcost = 0; // TODO: Check this - *cur_pu = merge.unit[best_merge_indx]; + *cur_pu = merge.unit[best_merge_key]; } if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { From 2b9b398524c555b4bd6463f5bd47cc57e2bc5d16 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 16:05:23 +0200 Subject: [PATCH 022/135] Remove now unnecessary state store/restore --- src/search_inter.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index df221bd4..dcb45d2f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1232,9 +1232,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, int LX_idx = ref_list_idx[ref_list]; - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; - // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; @@ -1248,9 +1245,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, lcu, ref_list); - // store old values back - 
cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. From 4d02b69c4e75d98d149a93c76cd5cf69fdcb163c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 19:34:05 +0200 Subject: [PATCH 023/135] Set CU type in inter search functions --- src/search_inter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search_inter.c b/src/search_inter.c index dcb45d2f..8db16cec 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1381,6 +1381,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->type = CU_INTER; unipred_pu->merged = false; unipred_pu->skipped = false; unipred_pu->inter.mv_dir = ref_list + 1; @@ -1514,6 +1515,8 @@ static void search_pu_inter_bipred(inter_search_info_t *info, CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } + bipred_pu->type = CU_INTER; + amvp_bipred->cost[amvp_bipred->size] = cost; amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; amvp_bipred->size++; From 0b223b24f21d43a42aadf514735a7725aa1129be Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 22:37:27 +0200 Subject: [PATCH 024/135] Fix comment --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 8db16cec..3c6f035d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1732,7 +1732,7 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search starts here - // Store unipred information of L0 and L1 for biprediction + // Store information of L0, L1, and bipredictions // Best cost will be left at MAX_DOUBLE if no valid CU is found unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; From bdece66dc40c4873c7fd1e0b92b17eaf21c8f919 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: 
Mon, 6 Dec 2021 23:12:47 +0200 Subject: [PATCH 025/135] Compare the final costs only once and then set the current CU --- src/search_inter.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 3c6f035d..9d077acb 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1740,6 +1740,7 @@ static void search_pu_inter(encoder_state_t * const state, for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].keys[i] = i; + amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1766,14 +1767,6 @@ static void search_pu_inter(encoder_state_t * const state, &amvp[1].unit[best_keys[1]] }; - // Set best valid unipred to cur_cu - if (best_cost < MAX_DOUBLE) { - // Map reference index to L0/L1 pictures - *cur_pu = *best_unipred[best_list]; - *inter_cost = amvp[best_list].cost[best_keys[best_list]]; - *inter_bitcost = amvp[best_list].bits[best_keys[best_list]]; - } - // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -1850,7 +1843,7 @@ static void search_pu_inter(encoder_state_t * const state, const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; best_bipred_cost += info.state->lambda_sqrt * extra_bits; - if (best_bipred_cost < *inter_cost) { + if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { @@ -1873,28 +1866,38 @@ static void search_pu_inter(encoder_state_t * const state, if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); kvz_sort_keys_by_cost(&amvp[2]); - int best_bipred_key = amvp[2].keys[0]; + } - if (amvp[2].size > 0 && amvp[2].cost[best_bipred_key] < *inter_cost) { - *inter_cost = amvp[2].cost[best_bipred_key]; - *inter_bitcost = amvp[2].bits[best_bipred_key]; - *cur_pu = amvp[2].unit[best_bipred_key]; + 
cu_info_t* best_inter_pu = NULL; + + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; } } // Compare best merge cost to amvp cost - int best_merge_key = merge.keys[0]; - int best_merge_cost = merge.cost[best_merge_key]; + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { - if (merge.size > 0 && best_merge_cost < *inter_cost) { - *inter_cost = best_merge_cost; - *inter_bitcost = 0; // TODO: Check this - *cur_pu = merge.unit[best_merge_key]; + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this } if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } + + *cur_pu = *best_inter_pu; } /** From 3e967c0077862fe762cda39059712712bb81ebf8 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 6 Dec 2021 23:30:34 +0200 Subject: [PATCH 026/135] Add missing assertion and set cu before --- src/search_inter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9d077acb..88a896ac 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1893,11 +1893,15 @@ static void search_pu_inter(encoder_state_t * const state, *inter_bitcost = 0; // TODO: Check this } - if (*inter_cost < INT_MAX && cur_pu->inter.mv_dir == 1) { + *cur_pu = *best_inter_pu; + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } - *cur_pu = *best_inter_pu; + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, 
cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } /** From 4e19f7b71e673d2f5acf50ff3bf1d6aefc55b47f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 7 Dec 2021 00:35:50 +0200 Subject: [PATCH 027/135] Move mode decision logic and current PU setting to higher-level functions --- src/search_inter.c | 322 ++++++++++++++++++++++++++++----------------- 1 file changed, 199 insertions(+), 123 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 88a896ac..76fb8d54 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -68,7 +68,7 @@ typedef struct { /** * \brief Top-left corner of the PU */ - const vector2d_t origin; + vector2d_t origin; int32_t width; int32_t height; @@ -1555,7 +1555,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, } /** - * \brief Update PU to have best modes at this depth. + * \brief Collect PU parameters and costs at this depth. * * \param state encoder state * \param x_cu x-coordinate of the containing CU @@ -1565,28 +1565,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, * \param i_pu index of the PU in the CU * \param lcu containing LCU * - * \param inter_cost Return inter cost of the best mode - * \param inter_bitcost Return inter bitcost of the best mode + * \param amvp Return searched AMVP PUs sorted by costs + * \param merge Return searched Merge PUs sorted by costs */ static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost) + int x_cu, int y_cu, + int depth, + part_mode_t part_mode, + int i_pu, + lcu_t *lcu, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + inter_search_info_t *info) { - *inter_cost = MAX_DOUBLE; - *inter_bitcost = MAX_INT; - const kvz_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = 
PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = LCU_WIDTH >> depth; + const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); + const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); + const int width = PU_GET_W(part_mode, width_cu, i_pu); + const int height = PU_GET_H(part_mode, width_cu, i_pu); // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and // nRx2N partitions. @@ -1595,31 +1593,31 @@ static void search_pu_inter(encoder_state_t * const state, // 2NxnD partitions. const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; cur_pu->part_size = part_mode; cur_pu->depth = depth; cur_pu->qp = state->qp; - inter_search_info_t info = { - .state = state, - .pic = frame->source, - .origin = { x, y }, - .width = width, - .height = height, - .mvd_cost_func = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost, - .optimized_sad = kvz_get_optimized_sad(width), - }; + + info->state = state; + info->pic = frame->source; + info->origin.x = x; + info->origin.y = y; + info->width = width; + info->height = height; + info->mvd_cost_func = cfg->mv_rdo ? 
kvz_calc_mvd_cost_cabac : calc_mvd_cost; + info->optimized_sad = kvz_get_optimized_sad(width); // Search for merge mode candidates - info.num_merge_cand = kvz_inter_get_merge_cand( + info->num_merge_cand = kvz_inter_get_merge_cand( state, x, y, width, height, merge_a1, merge_b1, - info.merge_cand, + info->merge_cand, lcu ); @@ -1628,16 +1626,16 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_pu, 1, 0); // Merge Analysis starts here - unit_stats_map_t merge = { .size = 0 }; + merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - merge.keys[i] = -1; - merge.cost[i] = MAX_DOUBLE; + merge->keys[i] = -1; + merge->cost[i] = MAX_DOUBLE; } // Check motion vector constraints and perform rough search - for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; + inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx]; cur_pu->inter.mv_dir = cur_cand->dir; cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; @@ -1651,16 +1649,16 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - merge.keys, - merge.size); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, + merge->keys, + merge->size); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list bool active_L0 = cur_pu->inter.mv_dir & 1; bool active_L1 = cur_pu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || + if (active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || + active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || is_duplicate) { continue; @@ -1668,28 +1666,28 @@ static void search_pu_inter(encoder_state_t * const state, kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - merge.cost[merge.size] = kvz_satd_any_size(width, height, + merge->cost[merge->size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - merge.cost[merge.size] += merge_idx * info.state->lambda_sqrt; - merge.bits[merge.size] = merge_idx; - merge.keys[merge.size] = merge.size; + merge->cost[merge->size] += merge_idx * info->state->lambda_sqrt; + merge->bits[merge->size] = merge_idx; + merge->keys[merge->size] = merge->size; - merge.unit[merge.size] = *cur_pu; - merge.unit[merge.size].type = CU_INTER; - merge.unit[merge.size].merge_idx = merge_idx; - merge.unit[merge.size].merged = true; - merge.unit[merge.size].skipped = false; + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; - merge.size++; + merge->size++; } - kvz_sort_keys_by_cost(&merge); + kvz_sort_keys_by_cost(merge); // Try early skip decision on just one merge candidate if available - int num_rdo_cands = MIN(1, merge.size); + int num_rdo_cands = MIN(1, merge->size); // Early Skip Mode Decision bool has_chroma = 
state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1700,14 +1698,14 @@ static void search_pu_inter(encoder_state_t * const state, // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. // Early terminate if merge candidate with zero CBF is found. - int merge_idx = merge.unit[merge.keys[merge_key]].merge_idx; - cur_pu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_pu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_pu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_pu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_pu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_pu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_pu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); kvz_inter_recon_cu(state, lcu, x, y, width, true, false); kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); @@ -1722,8 +1720,11 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; - *inter_cost = 0.0; // TODO: Check this - *inter_bitcost = merge_idx; // TODO: Check this + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; return; } } @@ -1732,9 +1733,9 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search 
starts here - // Store information of L0, L1, and bipredictions - // Best cost will be left at MAX_DOUBLE if no valid CU is found - unit_stats_map_t amvp[3] = { { .size = 0 }, { .size = 0 }, { .size = 0 } }; + amvp[0].size = 0; + amvp[1].size = 0; + amvp[2].size = 0; for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { @@ -1745,10 +1746,10 @@ static void search_pu_inter(encoder_state_t * const state, } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { - info.ref_idx = ref_idx; - info.ref = state->frame->ref->images[ref_idx]; + info->ref_idx = ref_idx; + info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); } kvz_sort_keys_by_cost(&amvp[0]); @@ -1782,8 +1783,8 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. - const image_list_t *const ref = info.state->frame->ref; - uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; bipred_pu->inter.mv_dir = 3; @@ -1800,10 +1801,10 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, bipred_pu, lcu, reflist); + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); } - kvz_inter_recon_bipred(info.state, + kvz_inter_recon_bipred(info->state, ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, @@ -1821,18 +1822,18 @@ static void search_pu_inter(encoder_state_t * const state, uint32_t bitcost[2] = { 0, 0 }; - best_bipred_cost += info.mvd_cost_func(info.state, + best_bipred_cost += 
info->mvd_cost_func(info->state, bipred_pu->inter.mv[0][0], bipred_pu->inter.mv[0][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[0]); - best_bipred_cost += info.mvd_cost_func(info.state, + best_bipred_cost += info->mvd_cost_func(info->state, bipred_pu->inter.mv[1][0], bipred_pu->inter.mv[1][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[1]); @@ -1841,15 +1842,15 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - best_bipred_cost += info.state->lambda_sqrt * extra_bits; + best_bipred_cost += info->state->lambda_sqrt * extra_bits; if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { int cu_mv_cand = select_mv_cand( - info.state, - info.mv_cand, + info->state, + info->mv_cand, bipred_pu->inter.mv[reflist][0], bipred_pu->inter.mv[reflist][1], NULL); @@ -1863,45 +1864,10 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo == 3) search_pu_inter_bipred(&info, depth, lcu, &amvp[2]); + if (cfg->rdo == 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); kvz_sort_keys_by_cost(&amvp[2]); } - - cu_info_t* best_inter_pu = NULL; - - for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { - - int best_key = amvp[mv_dir - 1].keys[0]; - - if (amvp[mv_dir - 1].size > 0 && - amvp[mv_dir - 1].cost[best_key] < *inter_cost) { - - best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; - *inter_cost = amvp[mv_dir - 1].cost[best_key]; - *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; - } - } - - // Compare best merge cost to amvp cost - int best_merge_key = merge.keys[0]; - - if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { - - best_inter_pu = &merge.unit[best_merge_key]; - *inter_cost = merge.cost[best_merge_key]; - *inter_bitcost = 0; // TODO: Check this - } - - 
*cur_pu = *best_inter_pu; - - if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { - assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); - } - - if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { - assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); - } } /** @@ -1973,13 +1939,69 @@ void kvz_search_cu_inter(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. + unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu, - inter_cost, - inter_bitcost); + amvp, + &merge, + &info); + // Early Skip CU decision + if (merge.size == 1 && merge.unit[0].skipped) { + *inter_cost = merge.cost[0]; + *inter_bitcost = merge.bits[0]; + return; + } + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this + } + + if (*inter_cost == MAX_DOUBLE) { + // Could not find any motion vector. 
+ *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + return; + } + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + *cur_pu = *best_inter_pu; + // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, @@ -1988,6 +2010,14 @@ void kvz_search_cu_inter(encoder_state_t * const state, inter_cost, inter_bitcost); } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } @@ -2014,6 +2044,16 @@ void kvz_search_cu_smp(encoder_state_t * const state, double *inter_cost, uint32_t *inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. 
+ unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + const int num_pu = kvz_part_mode_num_parts[part_mode]; const int width = LCU_WIDTH >> depth; const int y_local = SUB_SCU(y); @@ -2031,19 +2071,47 @@ void kvz_search_cu_smp(encoder_state_t * const state, double cost = MAX_DOUBLE; uint32_t bitcost = MAX_INT; - search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); + search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + cost = amvp[mv_dir - 1].cost[best_key]; + bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + cost = merge.cost[best_merge_key]; + bitcost = 0; // TODO: Check this + } if (cost == MAX_DOUBLE) { // Could not find any motion vector. 
- *inter_cost = MAX_DOUBLE; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; return; } - *inter_cost += cost; + *inter_cost += cost; *inter_bitcost += bitcost; cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); + *cur_pu = *best_inter_pu; + for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); @@ -2051,6 +2119,14 @@ void kvz_search_cu_smp(encoder_state_t * const state, scu->inter = cur_pu->inter; } } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } } // Count bits spent for coding the partition mode. From 7f7112cc5762c145171f30ec9d24a52c8ab30387 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 7 Dec 2021 22:04:41 +0200 Subject: [PATCH 028/135] Use up-to-date value of mv dir for bit cost calculations --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 76fb8d54..5112c0b7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1371,7 +1371,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - best_bits += cur_cu->inter.mv_dir - 1 + mv_ref_coded; + best_bits += ref_list + mv_ref_coded; // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); From 706d718d5d38f8b9a408bf726f1aa0daf09f6684 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 00:49:19 +0200 Subject: [PATCH 029/135] Perform FME for n best PUs from L0 and L1. 
--- src/search_inter.c | 74 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 5112c0b7..50548b1f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1347,11 +1347,6 @@ static void search_pu_inter_ref(inter_search_info_t *info, &best_cost, &best_bits, &best_mv); break; } - - if (cfg->fme_level > 0 && best_cost < MAX_DOUBLE) { - search_frac(info, &best_cost, &best_bits, &best_mv); - - } } if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { @@ -1768,6 +1763,75 @@ static void search_pu_inter(encoder_state_t * const state, &amvp[1].unit[best_keys[1]] }; + // Fractional-pixel motion estimation. + // Refine the best PUs so far from both lists, if available. + for (int list = 0; list < 2; ++list) { + + // TODO: make configurable + int n_best = MIN(1, amvp[list].size); + if (cfg->fme_level > 0) { + + for (int i = 0; i < n_best; ++i) { + + int key = amvp[list].keys[i]; + cu_info_t *unipred_pu = &amvp[list].unit[key]; + + // Find the reference picture + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int LX_idx = unipred_pu->inter.mv_ref[list]; + info->ref_idx = ref_LX[list][LX_idx]; + info->ref = ref->images[info->ref_idx]; + + kvz_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + unipred_pu, + lcu, + list); + + double *cost = &amvp[list].cost[key]; + + double frac_cost = MAX_DOUBLE; + uint32_t frac_bits = MAX_INT; + vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; + + search_frac(info, &frac_cost, &frac_bits, &frac_mv); + + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); + frac_bits += list + mv_ref_coded; + + bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); + if (valid_mv) { + + 
unipred_pu->inter.mv[list][0] = frac_mv.x; + unipred_pu->inter.mv[list][1] = frac_mv.y; + CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + + amvp[list].cost[key] = frac_cost; + amvp[list].bits[key] = frac_bits; + } + } + + // Invalidate PUs with SAD-based costs. (FME not performed). + // TODO: Recalculate SAD costs with SATD for further processing. + for (int i = n_best; i < amvp[list].size; ++i) { + int key = amvp[list].keys[i]; + amvp[list].cost[key] = MAX_DOUBLE; + } + } + + // Costs are now, SATD-based. Omit PUs with SAD-based costs. + // TODO: Recalculate SAD costs with SATD for further processing. + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size = n_best; + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred From 1af90b194efc21e9e35e09d8c24fbbbda602c4b1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 15:27:05 +0200 Subject: [PATCH 030/135] Add missing bits to RD costs. --- src/search_inter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 50548b1f..16da0168 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1366,7 +1366,9 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Only check when candidates are different uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - best_bits += ref_list + mv_ref_coded; + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + best_cost += extra_bits * info->state->lambda_sqrt; + best_bits += extra_bits; // Update best unipreds for biprediction bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); @@ -1804,7 +1806,9 @@ static void search_pu_inter(encoder_state_t * const state, uint8_t mv_ref_coded = LX_idx; int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); - frac_bits += list + mv_ref_coded; + const int 
extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing + frac_cost += extra_bits * info->state->lambda_sqrt; + frac_bits += extra_bits; bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); if (valid_mv) { From ae498553c0f3f825519924584661812b6b1abf97 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 15:53:31 +0200 Subject: [PATCH 031/135] Add define MAX_UNIT_STATS_MAP_SIZE. Add assertions to inter search. --- src/search.h | 9 +++++---- src/search_inter.c | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/search.h b/src/search.h index de34755b..e4b299c3 100644 --- a/src/search.h +++ b/src/search.h @@ -44,6 +44,7 @@ #include "image.h" #include "constraint.h" +#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) /** * \brief Data collected during search processes. @@ -56,10 +57,10 @@ */ typedef struct unit_stats_map_t { - cu_info_t unit[MAX_REF_PIC_COUNT]; //!< list of searched units - double cost[MAX_REF_PIC_COUNT]; //!< list of matching RD costs - uint32_t bits[MAX_REF_PIC_COUNT]; //!< list of matching bit costs - int8_t keys[MAX_REF_PIC_COUNT]; //!< list of keys (indices) to elements in the other arrays + cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units + double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs + uint32_t bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; diff --git a/src/search_inter.c b/src/search_inter.c index 16da0168..9f317021 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1681,6 +1681,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->size++; } + assert(merge->size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(merge); // Try early skip decision on just one merge candidate if available @@ -1735,7 
+1736,7 @@ static void search_pu_inter(encoder_state_t * const state, amvp[2].size = 0; for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { - for (int i = 0; i < MAX_REF_PIC_COUNT; ++i) { + for (int i = 0; i < state->frame->ref->used_size; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].keys[i] = i; amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; @@ -1749,6 +1750,8 @@ static void search_pu_inter(encoder_state_t * const state, search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); } + assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); + assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); @@ -1934,6 +1937,7 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: this probably should have a separate command line option if (cfg->rdo == 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); } } From 49935710a8808fb790e9286fe82775294b6b9cb2 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 16:35:47 +0200 Subject: [PATCH 032/135] Only one ME per reference picture (same ref in L0 and L1) --- src/search_inter.c | 294 +++++++++++++++++++++++---------------------- 1 file changed, 149 insertions(+), 145 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9f317021..4dbe2db9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1227,170 +1227,174 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Must find at least one reference picture assert(ref_list_active[0] || ref_list_active[1]); - for (int ref_list = 0; ref_list < 2; ++ref_list) { - if (ref_list_active[ref_list]) { + // Does not matter which list is used, if in both. + int ref_list = ref_list_active[0] ? 
0 : 1; + int LX_idx = ref_list_idx[ref_list]; - int LX_idx = ref_list_idx[ref_list]; + // Get MV candidates + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; - // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + kvz_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + cur_cu, + lcu, + ref_list); - kvz_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + vector2d_t best_mv = { 0, 0 }; - vector2d_t best_mv = { 0, 0 }; - - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - vector2d_t mv_previous = { 0, 0 }; - if (ref_cu->inter.mv_dir & 1) { - mv_previous.x = ref_cu->inter.mv[0][0]; - mv_previous.y = ref_cu->inter.mv[0][1]; - } else { - mv_previous.x = ref_cu->inter.mv[1][0]; - mv_previous.y = ref_cu->inter.mv[1][1]; - } - // Apply mv scaling if neighbor poc is available - if (info->state->frame->ref_LX_size[ref_list] > 0) { - // When there are reference pictures from the future (POC > current POC) - // in L0 or L1, the primary list for the colocated PU is the inverse of - // collocated_from_l0_flag. Otherwise it is equal to reflist. - // - // Kvazaar always sets collocated_from_l0_flag so the list is L1 when - // there are future references. 
- int col_list = ref_list; - for (int i = 0; i < info->state->frame->ref->used_size; i++) { - if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { - col_list = 1; - break; - } - } - if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { - // Use the other list if the colocated PU does not have a MV for the - // primary list. - col_list = 1 - col_list; - } - - uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; - // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument - apply_mv_scaling( - info->state->frame->poc, - info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], - info->state->frame->ref->pocs[neighbor_poc_index], - info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ - info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] - [ref_cu->inter.mv_ref[col_list]] - ], - &mv_previous - ); - } - - // Check if the mv is valid after scaling - if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - best_mv = mv_previous; + // Take starting point for MV search from previous frame. + // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. 
+ const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + vector2d_t mv_previous = { 0, 0 }; + if (ref_cu->inter.mv_dir & 1) { + mv_previous.x = ref_cu->inter.mv[0][0]; + mv_previous.y = ref_cu->inter.mv[0][1]; + } else { + mv_previous.x = ref_cu->inter.mv[1][0]; + mv_previous.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; } } - - int search_range = 32; - switch (cfg->ime_algorithm) { - case KVZ_IME_FULL64: search_range = 64; break; - case KVZ_IME_FULL32: search_range = 32; break; - case KVZ_IME_FULL16: search_range = 16; break; - case KVZ_IME_FULL8: search_range = 8; break; - default: break; + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. 
+ col_list = 1 - col_list; } - double best_cost = MAX_DOUBLE; - uint32_t best_bits = MAX_INT; + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv_previous + ); + } - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); - bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + // Check if the mv is valid after scaling + if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { + best_mv = mv_previous; + } + } + + int search_range = 32; + switch (cfg->ime_algorithm) { + case KVZ_IME_FULL64: search_range = 64; break; + case KVZ_IME_FULL32: search_range = 32; break; + case KVZ_IME_FULL16: search_range = 16; break; + case KVZ_IME_FULL8: search_range = 8; break; + default: break; + } + + double best_cost = MAX_DOUBLE; + uint32_t best_bits = MAX_INT; + + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). 
+ select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); - if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); - break; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_FULL64: + case KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_DIA: - diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; + case KVZ_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); - break; - } - } + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + } + } - if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { - // Recalculate inter cost with SATD. 
- best_cost = kvz_image_calc_satd( - info->state->tile->frame->source, - info->ref, - info->origin.x, - info->origin.y, - info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), - info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), - info->width, - info->height); - best_cost += best_bits * info->state->lambda_sqrt; - } + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. + best_cost = kvz_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (best_mv.x >> 2), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> 2), + info->width, + info->height); + best_cost += best_bits * info->state->lambda_sqrt; + } - // Only check when candidates are different - uint8_t mv_ref_coded = LX_idx; - int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); - const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing - best_cost += extra_bits * info->state->lambda_sqrt; - best_bits += extra_bits; + double LX_cost[2] = { best_cost, best_cost }; + double LX_bits[2] = { best_bits, best_bits }; - // Update best unipreds for biprediction - bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); - if (valid_mv && best_cost < MAX_DOUBLE) { + // Compute costs and add entries for both lists, if necessary + for (; ref_list_active[ref_list] && ref_list < 2; ++ref_list) { - // Map reference index to L0/L1 pictures - unit_stats_map_t *cur_map = &amvp[ref_list]; - int entry = cur_map->size; - cu_info_t *unipred_pu = &cur_map->unit[entry]; - unipred_pu->type = CU_INTER; - unipred_pu->merged = false; - unipred_pu->skipped = false; - unipred_pu->inter.mv_dir = ref_list + 1; - unipred_pu->inter.mv_ref[ref_list] = LX_idx; - unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; - unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; - CU_SET_MV_CAND(unipred_pu, 
ref_list, cu_mv_cand); + LX_idx = ref_list_idx[ref_list]; + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt; + LX_bits[ref_list] += extra_bits; - cur_map->cost[entry] = best_cost; - cur_map->bits[entry] = best_bits; - cur_map->size++; - } + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { + + // Map reference index to L0/L1 pictures + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *unipred_pu = &cur_map->unit[entry]; + unipred_pu->type = CU_INTER; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (int16_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (int16_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); + + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; + cur_map->size++; } } } From c411e659775405817e0a9857bfb715339ef28458 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 18:31:09 +0200 Subject: [PATCH 033/135] Prevent FME and bipred from the same reference picture if present in L0 and L1 --- src/search_inter.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 4dbe2db9..cca383b4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1760,6 +1760,38 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[1]); int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + + cu_info_t *best_unipred[2] = { + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] + }; + + // 
Prevent using the same ref picture with both lists. + // TODO: allow searching two MVs from the same reference picture. + if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { + + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int L0_idx = best_unipred[0]->inter.mv_ref[0]; + int L1_idx = best_unipred[1]->inter.mv_ref[1]; + + int L0_ref_idx = ref_LX[0][L0_idx]; + int L1_ref_idx = ref_LX[1][L1_idx]; + + if (L0_ref_idx == L1_ref_idx) { + // Invalidate the other based the list that has the 2nd best PU + double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE; + double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE; + int list = (L0_2nd_cost <= L1_2nd_cost) ? 1 : 0; + amvp[list].cost[best_keys[list]] = MAX_DOUBLE; + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size--; + best_keys[list] = amvp[list].keys[0]; + best_unipred[list] = &amvp[list].unit[best_keys[list]]; + } + } + double best_cost_L0 = MAX_DOUBLE; double best_cost_L1 = MAX_DOUBLE; if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; @@ -1767,11 +1799,6 @@ static void search_pu_inter(encoder_state_t * const state, int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; int best_cost = (best_cost_L0 <= best_cost_L1) ? best_cost_L0 : best_cost_L1; - cu_info_t *best_unipred[2] = { - &amvp[0].unit[best_keys[0]], - &amvp[1].unit[best_keys[1]] - }; - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { From f17a500b779f8c87d39410cc42279542380ae7af Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:06:12 +0200 Subject: [PATCH 034/135] Get rid of warnings. 
(Unused variables, suggested parentheses) --- src/search_inter.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index cca383b4..15606d52 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1658,8 +1658,8 @@ static void search_pu_inter(encoder_state_t * const state, // Don't add duplicates to list bool active_L0 = cur_pu->inter.mv_dir & 1; bool active_L1 = cur_pu->inter.mv_dir & 2; - if (active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]) || - active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]) || + if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) || + (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) || is_duplicate) { continue; @@ -1770,7 +1770,6 @@ static void search_pu_inter(encoder_state_t * const state, // TODO: allow searching two MVs from the same reference picture. if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { - const image_list_t *const ref = info->state->frame->ref; uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; int L0_idx = best_unipred[0]->inter.mv_ref[0]; @@ -1792,13 +1791,6 @@ static void search_pu_inter(encoder_state_t * const state, } } - double best_cost_L0 = MAX_DOUBLE; - double best_cost_L1 = MAX_DOUBLE; - if (amvp[0].size > 0) best_cost_L0 = amvp[0].cost[best_keys[0]]; - if (amvp[1].size > 0) best_cost_L1 = amvp[1].cost[best_keys[1]]; - int best_list = (best_cost_L0 <= best_cost_L1) ? 0 : 1; - int best_cost = (best_cost_L0 <= best_cost_L1) ? best_cost_L0 : best_cost_L1; - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. 
for (int list = 0; list < 2; ++list) { @@ -1830,8 +1822,6 @@ static void search_pu_inter(encoder_state_t * const state, lcu, list); - double *cost = &amvp[list].cost[key]; - double frac_cost = MAX_DOUBLE; uint32_t frac_bits = MAX_INT; vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; From e87b12dec17d6b0135f0ce9fcbad47669ffeceb9 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:09:16 +0200 Subject: [PATCH 035/135] Move mv_cand initialization to better place --- src/search_inter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 15606d52..659a112c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1602,6 +1602,9 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->depth = depth; cur_pu->qp = state->qp; + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); info->state = state; info->pic = frame->source; @@ -1622,10 +1625,6 @@ static void search_pu_inter(encoder_state_t * const state, lcu ); - // Default to candidate 0 - CU_SET_MV_CAND(cur_pu, 0, 0); - CU_SET_MV_CAND(cur_pu, 1, 0); - // Merge Analysis starts here merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { From bb1f2a0895d4489464fac4348ccb9c738d707652 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 21:13:25 +0200 Subject: [PATCH 036/135] Reorder condition to prevent indexing past the array --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 659a112c..6afa6c22 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1366,7 +1366,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, double LX_bits[2] = { best_bits, best_bits }; // Compute costs and add entries for both lists, if necessary - for (; ref_list_active[ref_list] && ref_list < 2; ++ref_list) { + for (; ref_list < 2 && 
ref_list_active[ref_list]; ++ref_list) { LX_idx = ref_list_idx[ref_list]; uint8_t mv_ref_coded = LX_idx; From a1a7036445c66571d1725c2d9c58917aff81da92 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 8 Dec 2021 23:03:18 +0200 Subject: [PATCH 037/135] Fix indexing. Get rid of warning about jump depending on uninitialized value. --- src/search_inter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 6afa6c22..67a3166d 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1534,14 +1534,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info, * * \return Does an identical candidate exist in list */ -static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, - inter_merge_cand_t * cand_to_add, - int8_t * added_idx_list, - int list_size) +static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, + inter_merge_cand_t *cand_to_add, + unit_stats_map_t *merge) { bool found = false; - for (int i = 0; i < list_size && !found; ++i) { - inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]]; + for (int i = 0; i < merge->size && !found; ++i) { + int key = merge->keys[i]; + inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx]; found = cand_to_add->dir == list_cand->dir && cand_to_add->ref[0] == list_cand->ref[0] && @@ -1606,6 +1606,8 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_pu, 0, 0); CU_SET_MV_CAND(cur_pu, 1, 0); + FILL(*info, 0); + info->state = state; info->pic = frame->source; info->origin.x = x; @@ -1649,9 +1651,7 @@ static void search_pu_inter(encoder_state_t * const state, if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, - merge->keys, - merge->size); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, 
cur_cand, merge); // Don't try merge candidates that don't satisfy mv constraints. // Don't add duplicates to list From e45c6a9c68fa511a10a97f6a0ac13d59b264484f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:02:26 +0200 Subject: [PATCH 038/135] Fix too few added keys in inter search stats. The function search_pu_inter_bipred may add more PUs than there are reference pictures. --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 67a3166d..e670683c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1394,6 +1394,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, cur_map->cost[entry] = best_cost; cur_map->bits[entry] = best_bits; + cur_map->keys[entry] = entry; cur_map->size++; } } @@ -1520,6 +1521,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, amvp_bipred->cost[amvp_bipred->size] = cost; amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size; amvp_bipred->size++; } } @@ -1741,7 +1743,6 @@ static void search_pu_inter(encoder_state_t * const state, for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < state->frame->ref->used_size; ++i) { amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary - amvp[mv_dir - 1].keys[i] = i; amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1950,6 +1951,7 @@ static void search_pu_inter(encoder_state_t * const state, amvp[2].cost[amvp[2].size] = best_bipred_cost; amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].keys[amvp[2].size] = amvp[2].size; amvp[2].size++; } } From 2424a976a408112e24f2eed1617d246025b15fab Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:21:58 +0200 Subject: [PATCH 039/135] Prevent using uninitialized memory --- src/search_inter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/src/search_inter.c b/src/search_inter.c index e670683c..59aa7342 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1759,7 +1759,10 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[0]); kvz_sort_keys_by_cost(&amvp[1]); - int best_keys[2] = { amvp[0].keys[0], amvp[1].keys[0] }; + int best_keys[2] = { + amvp[0].size > 0 ? amvp[0].keys[0] : 0, + amvp[1].size > 0 ? amvp[1].keys[0] : 0 + }; cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], From 51dd942778ef30288bcc270adc79bf5ba685f1a2 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 10 Dec 2021 00:32:08 +0200 Subject: [PATCH 040/135] Fix uninitialized fields of CU/PU infos. --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 59aa7342..08594b9f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1383,6 +1383,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, unit_stats_map_t *cur_map = &amvp[ref_list]; int entry = cur_map->size; cu_info_t *unipred_pu = &cur_map->unit[entry]; + *unipred_pu = *cur_cu; unipred_pu->type = CU_INTER; unipred_pu->merged = false; unipred_pu->skipped = false; @@ -1440,6 +1441,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, } cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; + *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); bipred_pu->inter.mv_dir = 3; @@ -1742,7 +1744,6 @@ static void search_pu_inter(encoder_state_t * const state, for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { for (int i = 0; i < state->frame->ref->used_size; ++i) { - amvp[mv_dir - 1].unit[i] = *cur_pu; // TODO: only initialize what is necessary amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; } } @@ -1871,6 +1872,7 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { cu_info_t *bipred_pu = &amvp[2].unit[0]; + *bipred_pu = *cur_pu; double best_bipred_cost = MAX_DOUBLE; // Try biprediction from 
valid acquired unipreds. From 6c50939af3261d1bbe449319509d12e26f6d728c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 21 Mar 2018 10:46:30 +0200 Subject: [PATCH 041/135] Different roi-matrix for each frame Implement reading the roi-matrix for each frame from binary file. Extremely simple and breaks on any unhappy paths. # Conflicts: # src/cfg.c # src/cli.c # src/encoder.c # src/image.c # src/rate_control.c --- src/cfg.c | 13 +++++++++++++ src/cli.c | 1 + src/encmain.c | 29 ++++++++++++++++++++++++++++- src/encoder.c | 2 +- src/image.c | 7 +++++++ src/kvazaar.h | 10 ++++++++++ src/rate_control.c | 23 ++++++++++++++++++++++- 7 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 07c71a55..6a1fcf40 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -142,6 +142,8 @@ int kvz_config_init(kvz_config *cfg) cfg->roi.width = 0; cfg->roi.height = 0; cfg->roi.dqps = NULL; + + cfg->roi_file = NULL; cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -190,6 +192,7 @@ int kvz_config_destroy(kvz_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); + FREE_POINTER(cfg->roi_file); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); @@ -1296,6 +1299,16 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) fclose(f); } + else if OPT("roi-file") + { + char* roifile = strdup(value); + if (!roifile) { + fprintf(stderr, "Failed to allocate memory for roi file name.\n"); + return 0; + } + FREE_POINTER(cfg->roi_file); + cfg->roi_file = roifile; + } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); } diff --git a/src/cli.c b/src/cli.c index 811537b3..4aa86794 100644 --- a/src/cli.c +++ b/src/cli.c @@ -141,6 +141,7 @@ static const struct option long_options[] = { { "force-level", required_argument, NULL, 0 }, { "high-tier", no_argument, NULL, 0 }, { "me-steps", required_argument, NULL, 0 }, + { "roi-file", required_argument, NULL, 0 }, { 
"fast-residual-cost", required_argument, NULL, 0 }, { "set-qp-in-cu", no_argument, NULL, 0 }, { "open-gop", no_argument, NULL, 0 }, diff --git a/src/encmain.c b/src/encmain.c index 5804c7f8..37f1c121 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -147,6 +147,7 @@ typedef struct { // Parameters passed from main thread to input thread. FILE* input; + FILE* roi_file; const kvz_api *api; const cmdline_opts_t *opts; const encoder_control_t *encoder; @@ -248,6 +249,21 @@ static void* input_read_thread(void* in_args) } } + if(args->roi_file) { + if (fread(&frame_in->roi, 4, 2, args->roi_file) != 2) { + fprintf(stderr, "Failed to read roi matrix size for frame: %d. Shutting down.\n", frames_read); + retval = RETVAL_FAILURE; + goto done; + } + const size_t roi_size = frame_in->roi.height*frame_in->roi.width; + frame_in->roi.roi_array = malloc(roi_size); + if(fread(frame_in->roi.roi_array, 1, roi_size, args->roi_file) != roi_size) { + fprintf(stderr, "Failed to read roi matrix for frame: %d. Shutting down.\n", frames_read); + retval = RETVAL_FAILURE; + goto done; + } + } + frames_read++; if (args->encoder->cfg.source_scan_type != 0) { @@ -427,6 +443,7 @@ int main(int argc, char *argv[]) FILE *input = NULL; //!< input file (YUV) FILE *output = NULL; //!< output file (HEVC NAL stream) FILE *recout = NULL; //!< reconstructed YUV output, --debug + FILE *roifile = NULL; clock_t start_time = clock(); clock_t encoding_start_cpu_time; KVZ_CLOCK_T encoding_start_real_time; @@ -493,6 +510,14 @@ int main(int argc, char *argv[]) goto exit_failure; } + if(opts->config->roi_file) { + roifile = fopen(opts->config->roi_file, "rb"); + if(roifile == NULL) { + fprintf(stderr, "Could not open roi file although it was required. Shutting down!\n"); + goto exit_failure; + } + } + #ifdef _WIN32 // Set stdin and stdout to binary for pipes. 
if (input == stdin) { @@ -566,9 +591,10 @@ int main(int argc, char *argv[]) // Give arguments via struct to the input thread input_handler_args in_args = { .available_input_slots = available_input_slots, - .filled_input_slots = filled_input_slots, + .filled_input_slots = filled_input_slots, .input = input, + .roi_file = roifile, .api = api, .opts = opts, .encoder = encoder, @@ -805,6 +831,7 @@ done: if (input) fclose(input); if (output) fclose(output); if (recout) fclose(recout); + if (roifile) fclose(roifile); CHECKPOINTS_FINALIZE(); diff --git a/src/encoder.c b/src/encoder.c index e582cc38..dd485e6a 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -416,7 +416,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) // for SMP and AMP partition units. encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.roi_file || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { encoder->max_qp_delta_depth = 0; } else { encoder->max_qp_delta_depth = -1; diff --git a/src/image.c b/src/image.c index ddd58d47..c923e78f 100644 --- a/src/image.c +++ b/src/image.c @@ -100,6 +100,10 @@ kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_ im->interlacing = KVZ_INTERLACING_NONE; + im->roi.roi_array = NULL; + im->roi.width = 0; + im->roi.height = 0; + return im; } @@ -126,6 +130,7 @@ void kvz_image_free(kvz_picture *const im) kvz_image_free(im->base_image); } else { free(im->fulldata_buf); + if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array); } // Make sure freed data won't be used. 
@@ -186,6 +191,8 @@ kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image, im->pts = 0; im->dts = 0; + im->roi = orig_image->roi; + return im; } diff --git a/src/kvazaar.h b/src/kvazaar.h index f03ffa27..967a3c67 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -393,6 +393,8 @@ typedef struct kvz_config int8_t *dqps; } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */ + char *roi_file; + unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */ /** @@ -510,6 +512,14 @@ typedef struct kvz_picture { enum kvz_chroma_format chroma_format; int32_t ref_pocs[16]; + + struct + { + int width; + int height; + int8_t *roi_array; + } roi; + } kvz_picture; /** diff --git a/src/rate_control.c b/src/rate_control.c index 4978ae04..e5620fb0 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1085,7 +1085,25 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL) { + if (ctrl->cfg.roi.dqps != NULL || state->tile->frame->source->roi.roi_array) { + vector2d_t lcu_vec = { + pos.x + state->tile->lcu_offset_x, + pos.y + state->tile->lcu_offset_y + }; + vector2d_t roi = { + lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu, + lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu + }; + int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width; + int dqp = state->tile->frame->source->roi.roi_array[roi_index]; + if(dqp != 0) { + pos.x = 0; + } + state->qp = CLIP_TO_QP(state->frame->QP + dqp); + state->lambda = qp_to_lambda(state, state->qp); + state->lambda_sqrt = sqrt(state->frame->lambda); + } + else if (ctrl->cfg.roi.dqps != NULL) { vector2d_t lcu = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y @@ -1096,6 +1114,9 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const 
state, }; int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; int dqp = ctrl->cfg.roi.dqps[roi_index]; + if (dqp != 0) { + pos.x = 0; + } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); From 917d26f1bf86286523cb9f2545bda363c6c97724 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 6 Feb 2022 20:08:28 +0200 Subject: [PATCH 042/135] Unify delta QP / ROI map functionality. --- README.md | 19 +++- configure.ac | 4 +- doc/kvazaar.1 | 21 ++-- src/cfg.c | 88 ++++------------ src/cli.c | 19 +++- src/encmain.c | 25 ----- src/encode_coding_tree.c | 2 +- src/encoder.c | 125 +++-------------------- src/encoder.h | 2 +- src/encoder_state-bitstream.c | 4 +- src/encoderstate.c | 186 +++++++++++++++++++++++++++++++++- src/encoderstate.h | 6 +- src/filter.c | 2 +- src/kvazaar.h | 17 ++-- src/rate_control.c | 21 +--- 15 files changed, 285 insertions(+), 256 deletions(-) diff --git a/README.md b/README.md index 2daa0fae..5d36012e 100644 --- a/README.md +++ b/README.md @@ -156,11 +156,20 @@ Video structure: - frametile: Constrain within the tile. - frametilemargin: Constrain even more. --roi : Use a delta QP map for region of interest. - Reads an array of delta QP values from a text - file. The file format is: width and height of - the QP delta map followed by width*height delta - QP values in raster order. The map can be of any - size and will be scaled to the video size. + Reads an array of delta QP values from a file. + Text and binary files are supported and detected + from the file extension (.txt/.bin). If a known + extension is not found, the file is treated as + a text file. The file can include one or many + ROI frames each in the following format: + width and height of the QP delta map followed + by width * height delta QP values in raster + order. In binary format, width and height are + 32-bit integers whereas the delta QP values are + signed 8-bit values. 
The map can be of any size + and will be scaled to the video size. The file + reading will loop if end of the file is reached. + See roi.txt in the examples folder. --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. in PPS and slice_qp_delta in slize header zero. --(no-)erp-aqp : Use adaptive QP for 360 degree video with diff --git a/configure.ac b/configure.ac index 832b584d..178a9b3d 100644 --- a/configure.ac +++ b/configure.ac @@ -22,8 +22,8 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # - Increment when making new releases and major or minor was not changed since last release. # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html -ver_major=6 -ver_minor=6 +ver_major=7 +ver_minor=0 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/doc/kvazaar.1 b/doc/kvazaar.1 index 93def73f..c5883b84 100644 --- a/doc/kvazaar.1 +++ b/doc/kvazaar.1 @@ -1,4 +1,4 @@ -.TH KVAZAAR "1" "October 2021" "kvazaar v2.1.0" "User Commands" +.TH KVAZAAR "1" "February 2022" "kvazaar v2.1.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS @@ -180,11 +180,20 @@ Constrain movement vectors. [none] .TP \fB\-\-roi Use a delta QP map for region of interest. -Reads an array of delta QP values from a text -file. The file format is: width and height of -the QP delta map followed by width*height delta -QP values in raster order. The map can be of any -size and will be scaled to the video size. +Reads an array of delta QP values from a file. +Text and binary files are supported and detected +from the file extension (.txt/.bin). If a known +extension is not found, the file is treated as +a text file. The file can include one or many +ROI frames each in the following format: +width and height of the QP delta map followed +by width * height delta QP values in raster +order. In binary format, width and height are +32\-bit integers whereas the delta QP values are +signed 8\-bit values. 
The map can be of any size +and will be scaled to the video size. The file +reading will loop if end of the file is reached. +See roi.txt in the examples folder. .TP \fB\-\-set\-qp\-in\-cu Set QP at CU level keeping pic_init_qp_minus26. diff --git a/src/cfg.c b/src/cfg.c index 6a1fcf40..69745a96 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -139,11 +139,9 @@ int kvz_config_init(kvz_config *cfg) cfg->gop_lp_definition.t = 1; cfg->open_gop = true; - cfg->roi.width = 0; - cfg->roi.height = 0; - cfg->roi.dqps = NULL; - - cfg->roi_file = NULL; + cfg->roi.file_path = NULL; + cfg->roi.format = KVZ_ROI_TXT; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -192,12 +190,11 @@ int kvz_config_destroy(kvz_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); - FREE_POINTER(cfg->roi_file); + FREE_POINTER(cfg->roi.file_path); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); FREE_POINTER(cfg->slice_addresses_in_ts); - FREE_POINTER(cfg->roi.dqps); FREE_POINTER(cfg->optional_key); FREE_POINTER(cfg->fastrd_learning_outdir_fn); } @@ -1244,70 +1241,29 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) } else if OPT("implicit-rdpcm") cfg->implicit_rdpcm = (bool)atobool(value); + else if OPT("roi") { - // The ROI description is as follows: - // First number is width, second number is height, - // then follows width * height number of dqp values. 
- FILE* f = fopen(value, "rb"); - if (!f) { - fprintf(stderr, "Could not open ROI file.\n"); + static enum kvz_roi_format const formats[] = { KVZ_ROI_TXT, KVZ_ROI_BIN }; + static const char * const format_names[] = { "txt", "bin", NULL }; + + char *roi_file = strdup(value); + if (!roi_file) { + fprintf(stderr, "Failed to allocate memory for ROI file name.\n"); return 0; } + FREE_POINTER(cfg->roi.file_path); + cfg->roi.file_path = roi_file; - int width = 0; - int height = 0; - if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) { - fprintf(stderr, "Failed to read ROI size.\n"); - fclose(f); - return 0; + // Get file extension or the substring after the last dot + char *maybe_extension = strrchr(cfg->roi.file_path, '.'); + if (!maybe_extension) { + cfg->roi.format = KVZ_ROI_TXT; + } else { + maybe_extension++; + int8_t format; + bool unknown_format = !parse_enum(maybe_extension, format_names, &format); + cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formats[format]; } - - if (width <= 0 || height <= 0) { - fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height); - fclose(f); - return 0; - } - - if (width > 10000 || height > 10000) { - fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); - fclose(f); - return 0; - } - - const unsigned size = width * height; - int8_t *dqp_array = calloc((size_t)size, sizeof(cfg->roi.dqps[0])); - if (!dqp_array) { - fprintf(stderr, "Failed to allocate memory for ROI table.\n"); - fclose(f); - return 0; - } - - FREE_POINTER(cfg->roi.dqps); - cfg->roi.dqps = dqp_array; - cfg->roi.width = width; - cfg->roi.height = height; - - for (int i = 0; i < size; ++i) { - int number; // Need a pointer to int for fscanf - if (fscanf(f, "%d", &number) != 1) { - fprintf(stderr, "Reading ROI file failed.\n"); - fclose(f); - return 0; - } - dqp_array[i] = CLIP(-51, 51, number); - } - - fclose(f); - } - else if OPT("roi-file") - { - char* roifile = strdup(value); - if (!roifile) { - fprintf(stderr, "Failed to allocate memory 
for roi file name.\n"); - return 0; - } - FREE_POINTER(cfg->roi_file); - cfg->roi_file = roifile; } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); diff --git a/src/cli.c b/src/cli.c index 4aa86794..b32c10c0 100644 --- a/src/cli.c +++ b/src/cli.c @@ -501,11 +501,20 @@ void print_help(void) " - frametile: Constrain within the tile.\n" " - frametilemargin: Constrain even more.\n" " --roi : Use a delta QP map for region of interest.\n" - " Reads an array of delta QP values from a text\n" - " file. The file format is: width and height of\n" - " the QP delta map followed by width*height delta\n" - " QP values in raster order. The map can be of any\n" - " size and will be scaled to the video size.\n" + " Reads an array of delta QP values from a file.\n" + " Text and binary files are supported and detected\n" + " from the file extension (.txt/.bin). If a known\n" + " extension is not found, the file is treated as\n" + " a text file. The file can include one or many\n" + " ROI frames each in the following format:\n" + " width and height of the QP delta map followed\n" + " by width * height delta QP values in raster\n" + " order. In binary format, width and height are\n" + " 32-bit integers whereas the delta QP values are\n" + " signed 8-bit values. The map can be of any size\n" + " and will be scaled to the video size. The file\n" + " reading will loop if end of the file is reached.\n" + " See roi.txt in the examples folder.\n" " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" " in PPS and slice_qp_delta in slize header zero.\n" " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" diff --git a/src/encmain.c b/src/encmain.c index 37f1c121..6d172442 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -147,7 +147,6 @@ typedef struct { // Parameters passed from main thread to input thread. 
FILE* input; - FILE* roi_file; const kvz_api *api; const cmdline_opts_t *opts; const encoder_control_t *encoder; @@ -249,21 +248,6 @@ static void* input_read_thread(void* in_args) } } - if(args->roi_file) { - if (fread(&frame_in->roi, 4, 2, args->roi_file) != 2) { - fprintf(stderr, "Failed to read roi matrix size for frame: %d. Shutting down.\n", frames_read); - retval = RETVAL_FAILURE; - goto done; - } - const size_t roi_size = frame_in->roi.height*frame_in->roi.width; - frame_in->roi.roi_array = malloc(roi_size); - if(fread(frame_in->roi.roi_array, 1, roi_size, args->roi_file) != roi_size) { - fprintf(stderr, "Failed to read roi matrix for frame: %d. Shutting down.\n", frames_read); - retval = RETVAL_FAILURE; - goto done; - } - } - frames_read++; if (args->encoder->cfg.source_scan_type != 0) { @@ -510,14 +494,6 @@ int main(int argc, char *argv[]) goto exit_failure; } - if(opts->config->roi_file) { - roifile = fopen(opts->config->roi_file, "rb"); - if(roifile == NULL) { - fprintf(stderr, "Could not open roi file although it was required. Shutting down!\n"); - goto exit_failure; - } - } - #ifdef _WIN32 // Set stdin and stdout to binary for pipes. 
if (input == stdin) { @@ -594,7 +570,6 @@ int main(int argc, char *argv[]) .filled_input_slots = filled_input_slots, .input = input, - .roi_file = roifile, .api = api, .opts = opts, .encoder = encoder, diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 90df4dd1..03b04943 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -758,7 +758,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ - if (depth <= ctrl->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } diff --git a/src/encoder.c b/src/encoder.c index dd485e6a..67751f56 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -32,9 +32,6 @@ #include "encoder.h" -// This define is required for M_PI on Windows. -#define _USE_MATH_DEFINES -#include #include #include @@ -45,14 +42,6 @@ #include "kvz_math.h" #include "fast_coeff_cost.h" -/** - * \brief Strength of QP adjustments when using adaptive QP for 360 video. - * - * Determined empirically. - */ -static const double ERP_AQP_STRENGTH = 3.0; - - static int encoder_control_init_gop_layer_weights(encoder_control_t * const); static unsigned cfg_num_threads(void) @@ -136,82 +125,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder) } -/** - * \brief Return weight for 360 degree ERP video - * - * Returns the scaling factor of area from equirectangular projection to - * spherical surface. - * - * \param y y-coordinate of the pixel - * \param h height of the picture - */ -static double ws_weight(int y, int h) -{ - return cos((y - 0.5 * h + 0.5) * (M_PI / h)); -} - - - -/** - * \brief Update ROI QPs for 360 video with equirectangular projection. - * - * Writes updated ROI parameters to encoder->cfg.roi. 
- * - * \param encoder encoder control - * \param orig_roi original delta QPs or NULL - * \param orig_width width of orig_roi - * \param orig_height height of orig_roi - */ -static void init_erp_aqp_roi(encoder_control_t* encoder, - int8_t *orig_roi, - int32_t orig_width, - int32_t orig_height) -{ - // Update ROI with WS-PSNR delta QPs. - int height = encoder->in.height_in_lcu; - int width = orig_roi ? orig_width : 1; - - int frame_height = encoder->in.real_height; - - encoder->cfg.roi.width = width; - encoder->cfg.roi.height = height; - encoder->cfg.roi.dqps = calloc(width * height, sizeof(orig_roi[0])); - - double total_weight = 0.0; - for (int y = 0; y < frame_height; y++) { - total_weight += ws_weight(y, frame_height); - } - - for (int y_lcu = 0; y_lcu < height; y_lcu++) { - int y_orig = LCU_WIDTH * y_lcu; - int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); - - double lcu_weight = 0.0; - for (int y = y_orig; y < y_orig + lcu_height; y++) { - lcu_weight += ws_weight(y, frame_height); - } - // Normalize. - lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); - - int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); - - if (orig_roi) { - // If a ROI array already exists, we copy the existing values to the - // new array while adding qp_delta to each. - int y_roi = y_lcu * orig_height / height; - for (int x = 0; x < width; x++) { - encoder->cfg.roi.dqps[x + y_lcu * width] = - CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta); - } - - } else { - // Otherwise, simply write qp_delta to the ROI array. - encoder->cfg.roi.dqps[y_lcu] = qp_delta; - } - } -} - - /** * \brief Allocate and initialize an encoder control structure. 
* @@ -353,6 +266,16 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) encoder->scaling_list.use_default_list = 1; } + // ROI / delta QP + if (cfg->roi.file_path) { + const char *mode[2] = { "r", "rb" }; + encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]); + if (!encoder->roi_file) { + fprintf(stderr, "Could not open ROI file.\n"); + goto init_failed; + } + } + if (cfg->fast_coeff_table_fn) { FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb"); if (fast_coeff_table_f == NULL) { @@ -396,32 +319,10 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) goto init_failed; } - if (cfg->erp_aqp) { - init_erp_aqp_roi(encoder, - cfg->roi.dqps, - cfg->roi.width, - cfg->roi.height); - - } else if (cfg->roi.dqps) { - // Copy delta QP array for ROI coding. - const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height; - encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0])); - memcpy(encoder->cfg.roi.dqps, - cfg->roi.dqps, - roi_size * sizeof(*cfg->roi.dqps)); - - } - // NOTE: When tr_depth_inter is equal to 0, the transform is still split // for SMP and AMP partition units. 
encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.roi_file || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { - encoder->max_qp_delta_depth = 0; - } else { - encoder->max_qp_delta_depth = -1; - } - //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; @@ -724,7 +625,7 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) FREE_POINTER(encoder->tiles_tile_id); - FREE_POINTER(encoder->cfg.roi.dqps); + FREE_POINTER(encoder->cfg.roi.file_path); FREE_POINTER(encoder->cfg.optional_key); kvz_scalinglist_destroy(&encoder->scaling_list); @@ -734,6 +635,10 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) kvz_close_rdcost_outfiles(); + if (encoder->roi_file) { + fclose(encoder->roi_file); + } + free(encoder); } diff --git a/src/encoder.h b/src/encoder.h index 89f6b3a2..24a93f86 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -130,7 +130,7 @@ typedef struct encoder_control_t //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - int8_t max_qp_delta_depth; + FILE *roi_file; int tr_depth_inter; diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 707103ad..05b934d4 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -503,10 +503,10 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { // Use separate QP for each LCU when rate control is enabled. 
WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); - WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth"); + WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth"); } else { WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); } diff --git a/src/encoderstate.c b/src/encoderstate.c index 6bcce76b..6e3cf0b4 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -32,6 +32,9 @@ #include "encoderstate.h" + // This define is required for M_PI on Windows. +#define _USE_MATH_DEFINES +#include #include #include #include @@ -51,6 +54,13 @@ #include "strategies/strategies-picture.h" +/** + * \brief Strength of QP adjustments when using adaptive QP for 360 video. + * + * Determined empirically. + */ +static const double ERP_AQP_STRENGTH = 3.0; + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -570,7 +580,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - if (depth <= state->encoder_control->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } @@ -650,7 +660,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); @@ -1252,6 +1262,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) } } + +/** + * \brief Return weight for 360 degree ERP video + * + * Returns the scaling factor of area from equirectangular projection to + * spherical surface. 
+ * + * \param y y-coordinate of the pixel + * \param h height of the picture + */ +static double ws_weight(int y, int h) +{ + return cos((y - 0.5 * h + 0.5) * (M_PI / h)); +} + + +/** + * \brief Update ROI QPs for 360 video with equirectangular projection. + * + * Updates the ROI parameters in frame->roi. + * + * \param encoder encoder control + * \param frame frame that will have the ROI map + */ +static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame) +{ + int8_t *orig_roi = frame->roi.roi_array; + int32_t orig_width = frame->roi.width; + int32_t orig_height = frame->roi.height; + + // Update ROI with WS-PSNR delta QPs. + int new_height = encoder->in.height_in_lcu; + int new_width = orig_roi ? orig_width : 1; + int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0])); + + int frame_height = encoder->in.real_height; + + double total_weight = 0.0; + for (int y = 0; y < frame_height; y++) { + total_weight += ws_weight(y, frame_height); + } + + for (int y_lcu = 0; y_lcu < new_height; y_lcu++) { + int y_orig = LCU_WIDTH * y_lcu; + int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); + + double lcu_weight = 0.0; + for (int y = y_orig; y < y_orig + lcu_height; y++) { + lcu_weight += ws_weight(y, frame_height); + } + // Normalize. + lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); + + int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); + + if (orig_roi) { + // If a ROI array already exists, we copy the existing values to the + // new array while adding qp_delta to each. + int y_roi = y_lcu * orig_height / new_height; + for (int x = 0; x < new_width; x++) { + new_array[x + y_lcu * new_width] = + CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta); + } + + } else { + // Otherwise, simply write qp_delta to the ROI array. 
+ new_array[y_lcu] = qp_delta; + } + } + + // Update new values + frame->roi.width = new_width; + frame->roi.height = new_height; + frame->roi.roi_array = new_array; + FREE_POINTER(orig_roi); +} + + +static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) { + // The ROI description is as follows: + // First number is width, second number is height, + // then follows width * height number of dqp values. + + // Rewind the (seekable) ROI file when end of file is reached. + // Allows a single ROI frame to be used for a whole sequence + // and looping with --loop-input. Skips possible whitespace. + if (ftell(file) != -1L) { + int c = fgetc(file); + while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file); + ungetc(c, file); + if (c == EOF) rewind(file); + } + + int *width = &frame->roi.width; + int *height = &frame->roi.height; + + bool failed = false; + + if (format == KVZ_ROI_TXT) failed = !fscanf(file, "%d", width) || !fscanf(file, "%d", height); + if (format == KVZ_ROI_BIN) failed = fread(&frame->roi, 4, 2, file) != 2; + + if (failed) { + fprintf(stderr, "Failed to read ROI size.\n"); + fclose(file); + assert(0); + } + + if (*width <= 0 || *height <= 0) { + fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height); + fclose(file); + assert(0); + } + + if (*width > 10000 || *height > 10000) { + fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(file); + assert(0); + } + + const unsigned size = (*width) * (*height); + int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0])); + if (!dqp_array) { + fprintf(stderr, "Failed to allocate memory for ROI table.\n"); + fclose(file); + assert(0); + } + + FREE_POINTER(frame->roi.roi_array); + frame->roi.roi_array = dqp_array; + + if (format == KVZ_ROI_TXT) { + for (int i = 0; i < size; ++i) { + int number; // Need a pointer to int for fscanf + if (fscanf(file, "%d", &number) != 1) { + fprintf(stderr, "Reading ROI file failed.\n"); + 
fclose(file); + assert(0); + } + dqp_array[i] = CLIP(-51, 51, number); + } + } else if (format == KVZ_ROI_BIN) { + if (fread(dqp_array, 1, size, file) != size) { + fprintf(stderr, "Reading ROI file failed.\n"); + assert(0); + } + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1265,6 +1423,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict state->tile->frame->height ); + // ROI / delta QP maps + if (frame->roi.roi_array && cfg->roi.file_path) { + assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified."); + } + + // Read frame from the file. If no file is specified, + // ROI data should be already set by the application. + if (cfg->roi.file_path) { + next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format); + } + + if (cfg->erp_aqp) { + init_erp_aqp_roi(state->encoder_control, state->tile->frame->source); + } + // Variance adaptive quantization if (cfg->vaq) { const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1351,6 +1524,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict } // Variance adaptive quantization - END + if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) { + state->frame->max_qp_delta_depth = 0; + } else { + state->frame->max_qp_delta_depth = -1; + } + // Use this flag to handle closed gop irap picture selection. 
// If set to true, irap is already set and we avoid // setting it based on the intra period @@ -1603,10 +1782,9 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y) int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { - const encoder_control_t *ctrl = state->encoder_control; const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); + const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); diff --git a/src/encoderstate.h b/src/encoderstate.h index a65e8b35..00885aa4 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -180,6 +180,8 @@ typedef struct encoder_state_config_frame_t { */ double *aq_offsets; + int8_t max_qp_delta_depth; + /** * \brief Whether next NAL is the first NAL in the access unit. 
*/ @@ -380,10 +382,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) */ static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) { - if (state->encoder_control->max_qp_delta_depth < 0) return false; + if (state->frame->max_qp_delta_depth < 0) return false; const int cu_width = LCU_WIDTH >> depth; - const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; const int right = x + cu_width; const int bottom = y + cu_width; return (right % qg_width == 0 || right >= state->tile->frame->width) && diff --git a/src/filter.c b/src/filter.c index d3bdfb7b..510b9ea6 100644 --- a/src/filter.c +++ b/src/filter.c @@ -274,7 +274,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir) static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (state->encoder_control->max_qp_delta_depth < 0) { + if (state->frame->max_qp_delta_depth < 0) { return state->qp; } diff --git a/src/kvazaar.h b/src/kvazaar.h index 967a3c67..73c7538d 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -250,6 +250,11 @@ enum kvz_file_format KVZ_FORMAT_YUV = 2 }; +enum kvz_roi_format +{ + KVZ_ROI_TXT = 0, + KVZ_ROI_BIN = 1 +}; // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -388,12 +393,9 @@ typedef struct kvz_config int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */ struct { - int32_t width; - int32_t height; - int8_t *dqps; - } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */ - - char *roi_file; + char *file_path; + enum kvz_roi_format format; + } roi; /*!< \brief Specify delta QPs for region of interest coding. */ unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. 
*/ @@ -764,6 +766,9 @@ typedef struct kvz_api { * the bitstream, length of the bitstream, the reconstructed frame, the * original frame and frame info in data_out, len_out, pic_out, src_out and * info_out, respectively. Otherwise, set the output parameters to NULL. + * + * Region of interest (ROI) / delta QP map can be specified in the input + * picture's ROI field but only when a ROI file is not used. * * After passing all of the input frames, the caller should keep calling this * function with pic_in set to NULL, until no more data is returned in the diff --git a/src/rate_control.c b/src/rate_control.c index e5620fb0..64983ec1 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1085,7 +1085,7 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL || state->tile->frame->source->roi.roi_array) { + if (state->tile->frame->source->roi.roi_array) { vector2d_t lcu_vec = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y @@ -1101,26 +1101,7 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); - state->lambda_sqrt = sqrt(state->frame->lambda); - } - else if (ctrl->cfg.roi.dqps != NULL) { - vector2d_t lcu = { - pos.x + state->tile->lcu_offset_x, - pos.y + state->tile->lcu_offset_y - }; - vector2d_t roi = { - lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu, - lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu - }; - int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; - int dqp = ctrl->cfg.roi.dqps[roi_index]; - if (dqp != 0) { - pos.x = 0; - } - state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); - } else if (ctrl->cfg.target_bitrate > 0) { const uint32_t pixels = MIN(LCU_WIDTH, 
state->tile->frame->width - LCU_WIDTH * pos.x) * From 85d1a54adc448b85512092e11cb690b4a3297d0b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 17 Mar 2022 14:48:08 +0200 Subject: [PATCH 043/135] Add cli option for forcing inter --- configure.ac | 2 +- src/cfg.c | 4 ++++ src/cli.c | 6 ++++++ src/kvazaar.h | 2 ++ src/search.c | 7 ++++--- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 832b584d..8171fec6 100644 --- a/configure.ac +++ b/configure.ac @@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=6 -ver_minor=6 +ver_minor=7 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/src/cfg.c b/src/cfg.c index c8a3dfa4..61a23d33 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -185,6 +185,7 @@ int kvz_config_init(kvz_config *cfg) cfg->fastrd_learning_outdir_fn = NULL; cfg->combine_intra_cus = 1; + cfg->force_inter = 0; return 1; } @@ -1426,6 +1427,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) else if OPT("combine-intra-cus") { cfg->combine_intra_cus = atobool(value); } + else if OPT("force-inter") { + cfg->force_inter = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 2212aa9b..69fffb3a 100644 --- a/src/cli.c +++ b/src/cli.c @@ -169,6 +169,8 @@ static const struct option long_options[] = { { "fastrd-outdir", required_argument, NULL, 0 }, { "combine-intra-cus", no_argument, NULL, 0 }, { "no-combine-intra-cus", no_argument, NULL, 0 }, + { "force-inter", no_argument, NULL, 0 }, + { "no-force-inter", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -586,6 +588,10 @@ void print_help(void) " be disabled if cus absolutely must not\n" " be larger than limited by the search.\n" " [enabled]" + " --force-inter : Force the encoder to use inter always.\n" + " This is mostly for debugging and is not\n" + " guaranteed to produce sensible 
bitstream or\n" + " work at all. [disabled]" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/kvazaar.h b/src/kvazaar.h index 0e6779b4..1bd59392 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -482,6 +482,8 @@ typedef struct kvz_config /** \brief whether to try combining intra cus at the lower depth when search * is not performed at said depth*/ uint8_t combine_intra_cus; + + uint8_t force_inter; } kvz_config; /** diff --git a/src/search.c b/src/search.c index d2de84cb..931555f8 100644 --- a/src/search.c +++ b/src/search.c @@ -577,12 +577,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width || - (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height; + (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && + !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I); if (can_use_intra && !skip_intra) { int8_t intra_mode; @@ -710,7 +711,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. 
cur_cu->type == CU_NOTSET || - depth < pu_depth_intra.max || + (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != KVZ_SLICE_I)) || (state->frame->slicetype != KVZ_SLICE_I && depth < pu_depth_inter.max); From d5e4e831f41cd2fb4db3973c396aace58d322c04 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 2 Dec 2021 10:05:21 +0200 Subject: [PATCH 044/135] Preliminary code for outputting bit costs during the search --- src/bitstream.c | 1 + src/cabac.c | 2 ++ src/cabac.h | 30 +++++++++++++++-------- src/encode_coding_tree.c | 4 ++++ src/encoderstate.c | 2 ++ src/global.h | 2 +- src/rdo.h | 2 -- src/sao.c | 6 ++++- src/search.c | 51 ++++++++++++++++++++++++++++------------ src/search.h | 4 ++-- src/search_inter.c | 9 +++++-- src/search_intra.c | 30 +++++++++++++++-------- 12 files changed, 100 insertions(+), 43 deletions(-) diff --git a/src/bitstream.c b/src/bitstream.c index 6a198632..f7433498 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -33,6 +33,7 @@ #include "bitstream.h" #include +#include #include #include diff --git a/src/cabac.c b/src/cabac.c index c0bbb26e..7f5b92c2 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -37,6 +37,8 @@ #include "extras/crypto.h" #include "kvazaar.h" +FILE* bit_cost_file = NULL; + const uint8_t kvz_g_auc_next_state_mps[128] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, diff --git a/src/cabac.h b/src/cabac.h index 3804fdf2..fa17c799 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -42,6 +42,8 @@ #include "bitstream.h" +extern FILE* bit_cost_file; + struct encoder_state_t; // Types @@ -126,6 +128,9 @@ void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t max_symbol); void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); +extern const float kvz_f_entropy_bits[128]; +#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] +extern double bits_written; // Macros 
#define CTX_STATE(ctx) ((ctx)->uc_state >> 1) @@ -133,24 +138,29 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } +#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) + #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ - kvz_cabac_encode_bin((data), (value)) \ - printf("%s = %u, state = %u -> %u\n", \ - (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } + uint32_t prev_state = (data)->cur_ctx->uc_state; \ + if(!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (value));\ + kvz_cabac_encode_bin((data), (value)); \ + if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } #define CABAC_BINS_EP(data, value, bins, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ + uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bins_ep((data), (value), (bins)); \ - printf("%s = %u(%u bins), state = %u -> %u\n", \ - (name), (uint32_t)(value), (bins), prev_state, (data)->ctx->uc_state); } + if(!(data)->only_count) bits_written += (bins); \ + if(!(data)->only_count) printf("%s = %u(%u bins), state = %u -> %u\n", \ + (name), (uint32_t)(value), (bins), prev_state, (data)->cur_ctx->uc_state); } #define CABAC_BIN_EP(data, value, name) { \ - uint32_t prev_state = (data)->ctx->uc_state; \ + uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin_ep((data), (value)); \ - printf("%s = %u, state = %u -> %u\n", \ - (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } + if(!(data)->only_count) bits_written += 1; \ + 
if(!(data)->only_count) printf("%s = %u, state = %u -> %u\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state); } #else #define CABAC_BIN(data, value, name) \ kvz_cabac_encode_bin((data), (value)); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 03b04943..0070b718 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -923,6 +923,10 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } + if((x % 64 != 0 && y % 64 != 0) || 1) { + fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); + bits_written = 0; + } } diff --git a/src/encoderstate.c b/src/encoderstate.c index 6e3cf0b4..483dfb6a 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -1655,9 +1655,11 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s } } +double bits_written; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) { + bits_written = 0; encoder_state_init_new_frame(state, frame); encoder_state_encode(state); diff --git a/src/global.h b/src/global.h index c6a6ebba..9a2ee989 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -//#define VERBOSE 1 +#define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/rdo.h b/src/rdo.h index 3b56ddcc..dd75fdb9 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -85,7 +85,5 @@ extern const uint32_t kvz_entropy_bits[128]; #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(ctx)->uc_state ^ (val)] // Floating point fractional bits, derived from kvz_entropy_bits -extern const float kvz_f_entropy_bits[128]; -#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] #endif diff --git a/src/sao.c b/src/sao.c index 8da94345..e9fab518 100644 --- a/src/sao.c +++ b/src/sao.c @@ -508,6 +508,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (state->encoder_control->cfg.sao_type & 1){ sao_search_edge_sao(state, data, recdata, 
block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left); float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt); + FILE_BITS(mode_bits, 0, 0, 0, "sao mode bits"); int ddistortion = (int)(mode_bits * state->lambda + 0.5); unsigned buf_i; @@ -552,7 +553,9 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ // Choose between SAO and doing nothing, taking into account the // rate-distortion cost of coding do nothing. { - int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5); + float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); + int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); + FILE_BITS(mode_bits_none, 0, 0, 0, "Sao cost of nothing"); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; @@ -569,6 +572,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (merge_cand) { unsigned buf_i; float mode_bits = sao_mode_bits_merge(state, i + 1); + FILE_BITS(mode_bits, 0, 0, 0, (i == 0 ? 
"sao merge ""left" : "sao merge ""top")); int ddistortion = (int)(mode_bits * state->lambda + 0.5); switch (merge_cand->type) { diff --git a/src/search.c b/src/search.c index 7b343d2e..1fc47a06 100644 --- a/src/search.c +++ b/src/search.c @@ -248,7 +248,8 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { const int width = LCU_WIDTH >> depth; @@ -272,16 +273,17 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); + *bit_cost += tr_tree_bits; } if (tr_depth > 0) { int offset = width / 2; double sum = 0; - sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); return sum + tr_tree_bits * state->lambda; } @@ -294,6 +296,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + *bit_cost += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + } // SSD between 
reconstruction and original @@ -310,6 +314,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); + *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -320,7 +325,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; @@ -347,16 +353,17 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); } + *bit_cost += tr_tree_bits; } if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); int sum = 0; - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); return sum + tr_tree_bits * state->lambda; } @@ -380,6 +387,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, coeff_bits += 
kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order); coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order); + *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -690,9 +698,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { - cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); + double bits = 0; + cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); } double mode_bits; @@ -701,6 +710,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else { mode_bits = inter_bitcost; } + bits += mode_bits; + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + const cabac_ctx_t* ctx = &(state->cabac.ctx.split_flag_model[split_model]); + // bits += CTX_ENTROPY_FBITS(ctx, 0); + FILE_BITS(bits, x, y, depth, "final rd bits"); cost += mode_bits * state->lambda; @@ -746,14 +760,18 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "not split"); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. 
const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "not split"); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); } // If skip mode was selected for the block, skip further search. @@ -783,6 +801,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { cost = 0; + double bits = 0; cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -799,11 +818,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (has_chroma) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); } - + + FILE_BITS(bits, x, y, depth, "merged intra bits"); // Add the cost of coding no-split. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); @@ -979,6 +999,7 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { + if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index e4b299c3..2ca47c22 100644 --- a/src/search.h +++ b/src/search.h @@ -72,11 +72,11 @@ void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu); + lcu_t *const lcu, double *bits); double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, - lcu_t *const lcu); + lcu_t *const lcu, double* bits); void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index 08594b9f..f8b88509 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1967,6 +1967,8 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); } + + FILE_BITS((double)info->inter_bitcost, x, y, depth, "regular inter bitcost"); } /** @@ -2009,11 +2011,14 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, lcu, false); - *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + double bits; + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), 
depth, cur_cu, lcu, &bits); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); } + FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); + *inter_cost += *inter_bitcost * state->lambda; } diff --git a/src/search_intra.c b/src/search_intra.c index 6d3aa141..bd259e22 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -179,7 +179,8 @@ static double search_intra_trdepth(encoder_state_t * const state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, cu_info_t *const pred_cu, - lcu_t *const lcu) + lcu_t *const lcu, + double *bit_cost) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -201,6 +202,7 @@ static double search_intra_trdepth(encoder_state_t * const state, double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; + double nosplit_bits = 0; if (depth > 0) { tr_cu->tr_depth = depth; @@ -221,9 +223,9 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_mode, chroma_mode, pred_cu, lcu); - nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); if (reconstruct_chroma) { - nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); } // Early stop codition for the recursive search. 
@@ -250,15 +252,15 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 3 * state->lambda; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); + split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); } double tr_split_bit = 0.0; @@ -269,6 +271,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth >= 1 && depth <= 3) { const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); + *bit_cost += tr_split_bit; } // Add cost of cbf chroma bits on transform tree. 
@@ -287,6 +290,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); } + *bit_cost += cbf_bits; } double bits = tr_split_bit + cbf_bits; @@ -608,7 +612,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); + double bit_costs = 0; + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, &bit_costs); costs[rdo_mode] += mode_cost; // Early termination if no coefficients has to be coded @@ -621,6 +626,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Update order according to new costs kvz_sort_modes(modes, costs, modes_to_check); + // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. 
@@ -632,7 +638,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.intra.mode = modes[0]; pred_cu.intra.mode_chroma = modes[0]; FILL(pred_cu.cbf, 0); - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); + double bit_cost = 0; + search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, &bit_cost); + FILE_BITS(bit_cost, x_px, y_px, depth, "tr_depth bits"); } return modes_to_check; @@ -705,9 +713,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, depth, -1, chroma.mode, // skip luma NULL, lcu); - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + double bits = 0; + chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &bits); double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); + bits += mode_bits; chroma.cost += mode_bits * state->lambda; if (chroma.cost < best_chroma.cost) { From 53264bc764c2ef2354571f83d42fda8381c7d930 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 3 Dec 2021 09:09:57 +0200 Subject: [PATCH 045/135] Update cabac context during search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a separate cabac that is only used during the search. It should hold the state that the actual cabac ends up in after encoding said CU. Only implemented for intra so far.
TODO: 4×4 PUs probably still have some problems --- src/cabac.h | 3 +- src/encoderstate.c | 4 +++ src/encoderstate.h | 1 + src/rdo.c | 11 ++++--- src/sao.c | 8 +++--- src/search.c | 71 ++++++++++++++++++++++++++++++---------------- src/search_intra.c | 39 ++++++++++++++++++++----- 7 files changed, 97 insertions(+), 40 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index fa17c799..7dd65a54 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -60,7 +60,8 @@ typedef struct uint32_t buffered_byte; int32_t num_buffered_bytes; int32_t bits_left; - int8_t only_count; + int8_t only_count : 4; + int8_t update : 4; bitstream_t *stream; // CONTEXTS diff --git a/src/encoderstate.c b/src/encoderstate.c index 483dfb6a..012476df 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -731,6 +731,8 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_bitstream_align_zero(state->cabac.stream); kvz_cabac_start(&state->cabac); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; kvz_crypto_delete(&state->crypto_hdl); } @@ -1214,6 +1216,8 @@ static void encoder_state_init_children(encoder_state_t * const state) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 
26 : state->frame->QP, state->frame->slicetype); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; } //Clear the jobs diff --git a/src/encoderstate.h b/src/encoderstate.h index 00885aa4..ac62a5a7 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -294,6 +294,7 @@ typedef struct encoder_state_t { bitstream_t stream; cabac_data_t cabac; + cabac_data_t search_cabac; // Crypto stuff crypto_handle_t *crypto_hdl; diff --git a/src/rdo.c b/src/rdo.c index 5403fa61..6b8960ee 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -253,12 +253,12 @@ static INLINE uint32_t get_coeff_cabac_cost( // Take a copy of the CABAC so that we don't overwrite the contexts when // counting the bits. cabac_data_t cabac_copy; - memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy)); + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); // Clear bytes and bits and set mode to "count" cabac_copy.only_count = 1; - cabac_copy.num_buffered_bytes = 0; - cabac_copy.bits_left = 23; + int num_buffered_bytes = cabac_copy.num_buffered_bytes; + int bits_left = cabac_copy.bits_left; // Execute the coding function. 
// It is safe to drop the const modifier since state won't be modified @@ -270,8 +270,11 @@ static INLINE uint32_t get_coeff_cabac_cost( type, scan_mode, 0); + if(cabac_copy.update) { - return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); + memcpy(&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + } + return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc) diff --git a/src/sao.c b/src/sao.c index e9fab518..35be7176 100644 --- a/src/sao.c +++ b/src/sao.c @@ -52,7 +52,7 @@ static void init_sao_info(sao_info_t *sao) { static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { @@ -74,7 +74,7 @@ static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t static float sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); @@ -91,7 +91,7 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { @@ -131,7 +131,7 @@ static float sao_mode_bits_band(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; + const cabac_data_t * const cabac = &state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { diff --git a/src/search.c b/src/search.c index 1fc47a06..2cb34608 100644 --- a/src/search.c +++ b/src/search.c @@ -245,7 +245,7 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -double kvz_cu_rd_cost_luma(const encoder_state_t *const state, +double kvz_cu_rd_cost_luma(encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, lcu_t *const lcu, @@ -271,8 +271,12 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, && width > TR_MIN_WIDTH && !intra_split_flag) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, tr_depth > 0, "tr_split_search"); + } *bit_cost += tr_tree_bits; } @@ -294,9 +298,14 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); - *bit_cost += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_luma[!tr_depth]); + int is_set = 
cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); + } + *bit_cost += CTX_ENTROPY_FBITS(ctx, is_set); } @@ -346,7 +355,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (depth < MAX_PU_DEPTH) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); } @@ -494,6 +503,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double inter_zero_coeff_cost = MAX_DOUBLE; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; + cabac_data_t pre_search_cabac; + memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); struct { int32_t min; @@ -699,24 +710,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; - cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - } + state->search_cabac.update = 1; + + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + // TODO: intra 4x4 PUs use different method + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); double mode_bits; if (cur_cu->type == CU_INTRA) { mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); - } else { + } + else { mode_bits = 
inter_bitcost; } bits += mode_bits; - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t* ctx = &(state->cabac.ctx.split_flag_model[split_model]); - // bits += CTX_ENTROPY_FBITS(ctx, 0); - FILE_BITS(bits, x, y, depth, "final rd bits"); + cost = mode_bits * state->lambda; - cost += mode_bits * state->lambda; + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); + } + + FILE_BITS(bits, x, y, depth, "final rd bits"); if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -739,7 +757,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - } + state->search_cabac.update = 0; + } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even @@ -754,24 +773,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int half_cu = cu_width / 2; double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf, depth); + cabac_data_t post_seach_cabac; + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "not split"); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); - cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "not split"); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN + state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); } // If skip mode was selected for the block, skip further search. @@ -826,7 +848,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, FILE_BITS(bits, x, y, depth, "merged intra bits"); // Add the cost of coding no-split. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // Add the cost of coding intra mode only once. @@ -845,6 +867,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth > 0) { // Copy this CU's mode all the way down for use in adjacent CUs mode // search. 
+ memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); work_tree_copy_down(x_local, y_local, depth, work_tree); } } else if (depth >= 0 && depth < MAX_PU_DEPTH) { diff --git a/src/search_intra.c b/src/search_intra.c index bd259e22..ccf1ca91 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -98,11 +98,11 @@ static double get_cost(encoder_state_t * const state, // Add the offset bit costs of signaling 'luma and chroma use trskip', // versus signaling 'luma and chroma don't use trskip' to the SAD cost. - const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma; + const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma; double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); if (state->encoder_control->chroma_format != KVZ_CSP_400) { - ctx = &state->cabac.ctx.transform_skip_model_chroma; + ctx = &state->search_cabac.ctx.transform_skip_model_chroma; trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); } @@ -269,7 +269,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // Add bits for split_transform_flag = 1, because transform depth search bypasses // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); *bit_cost += tr_split_bit; } @@ -283,7 +283,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (state->encoder_control->chroma_format != KVZ_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); } @@ -647,8 +647,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } -double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) +double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { + cabac_data_t* cabac = &state->search_cabac; double mode_bits; bool mode_in_preds = false; @@ -658,8 +659,23 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const } } - const cabac_ctx_t *ctx = &(state->cabac.ctx.intra_mode_model); + const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds); + if (state->search_cabac.update) { + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, mode_in_preds, "prev_intra_luma_pred_flag_search"); + if(mode_in_preds) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); + if(luma_mode != intra_preds[0]) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); + } + } + else { + // This value should be transformed for actual coding, + // but here the value does not actually matter, just that we write 5 bits + CABAC_BINS_EP(cabac, luma_mode, 5, 
"rem_intra_luma_pred_mode"); + } + } if (mode_in_preds) { mode_bits += ((luma_mode == intra_preds[0]) ? 1 : 2); @@ -673,13 +689,22 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model[0]); + cabac_data_t* cabac = &state->search_cabac; + const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); double mode_bits; if (chroma_mode == luma_mode) { mode_bits = CTX_ENTROPY_FBITS(ctx, 0); } else { mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1); } + if(cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, chroma_mode != luma_mode, "intra_chroma_pred_mode"); + if(chroma_mode != luma_mode) { + // Again it does not matter what we actually write here + CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); + } + } return mode_bits; } From 9ed8d0a7d9e6570763ca31e4a928ecb7fec8b3b9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 7 Dec 2021 08:13:08 +0200 Subject: [PATCH 046/135] count all non-tr-depth related bits correctly --- src/cabac.c | 1 + src/search.c | 74 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 7f5b92c2..5842edbe 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -97,6 +97,7 @@ void kvz_cabac_start(cabac_data_t * const data) data->num_buffered_bytes = 0; data->buffered_byte = 0xff; data->only_count = 0; // By default, write bits out + data->update = 0; } /** diff --git a/src/search.c b/src/search.c index 2cb34608..a0534bf4 100644 --- a/src/search.c +++ b/src/search.c @@ -265,17 +265,27 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, const uint8_t tr_depth = tr_cu->tr_depth - depth; + cabac_data_t* cabac = &state->search_cabac; + // Add transform_tree split_transform_flag bit cost. 
bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + int max_tr_depth; + if (tr_cu->type == CU_INTRA) { + max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; + } + else { + max_tr_depth = state->encoder_control->tr_depth_inter; + } if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH - && !intra_split_flag) + && !intra_split_flag + && tr_depth < max_tr_depth) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + const cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); - if (state->search_cabac.update) { - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, tr_depth > 0, "tr_split_search"); + if (cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, tr_depth > 0, "tr_split_search"); } *bit_cost += tr_tree_bits; } @@ -298,14 +308,28 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_luma[!tr_depth]); + const cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); - if (state->search_cabac.update) { + if (cabac->update) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac state changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. 
+ if (state->encoder_control->chroma_format != KVZ_CSP_400) { + const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, u_is_set); + CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); + tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); + CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); + } + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + *bit_cost += tr_tree_bits; state->search_cabac.cur_ctx = ctx; CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); } - *bit_cost += CTX_ENTROPY_FBITS(ctx, is_set); } @@ -353,7 +377,8 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return 0; } - if (depth < MAX_PU_DEPTH) { + // See luma for why the second condition + if (depth < MAX_PU_DEPTH && !state->search_cabac.update) { const int tr_depth = depth - pred_cu->depth; const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { @@ -712,12 +737,21 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double bits = 0; state->search_cabac.update = 1; - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - // TODO: intra 4x4 PUs use different method - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + if(depth < MAX_DEPTH) { + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + } + else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { + // 
Add cost of intra part_size. + const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN + state->search_cabac.cur_ctx = ctx; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); + } double mode_bits; if (cur_cu->type == CU_INTRA) { @@ -776,6 +810,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_data_t post_seach_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + state->search_cabac.update = 1; if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. @@ -792,9 +827,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 0, "split_search"); } + state->search_cabac.update = 0; // If skip mode was selected for the block, skip further search. 
// Skip mode means there's no coefficients in the block, so splitting @@ -1023,6 +1059,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); From d2299adb1c301d1a3d723cf7832a81bfb0e6bc9e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 7 Dec 2021 09:11:47 +0200 Subject: [PATCH 047/135] Disable bit debug code when VERBOSE is not defined and count bits when combining the intra cus --- src/cabac.h | 6 +++++- src/encode_coding_tree.c | 3 +++ src/global.h | 2 +- src/search.c | 29 +++++++++++++++++++++++++++-- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index 7dd65a54..59fb448c 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -139,7 +139,11 @@ extern double bits_written; #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } -#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) +#ifdef VERBOSE +#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) +#else +#define FILE_BITS(bits, x, y, depth, name) {} +#endif #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 0070b718..aa083f5b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -923,10 +923,13 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } +#ifdef VERBOSE if((x % 64 != 0 && y 
% 64 != 0) || 1) { fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); bits_written = 0; } +#endif + } diff --git a/src/global.h b/src/global.h index 9a2ee989..2ad0830b 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -#define VERBOSE 1 +// #define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/search.c b/src/search.c index a0534bf4..ce521e23 100644 --- a/src/search.c +++ b/src/search.c @@ -854,12 +854,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, && x + cu_width <= frame->width && y + cu_width <= frame->height && state->encoder_control->cfg.combine_intra_cus) { + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + if (depth < MAX_DEPTH) { + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + state->search_cabac.cur_ctx = ctx; + bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + } + else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { + // Add cost of intra part_size. 
+ const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN + state->search_cabac.cur_ctx = ctx; + FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); + CABAC_BIN(&state->search_cabac, 1, "split_search"); + } cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -876,6 +895,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + cost += mode_bits * state->lambda; + cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (has_chroma) { cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); @@ -888,8 +910,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // Add the cost of coding intra mode only once. - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); - cost += mode_bits * state->lambda; + + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } @@ -1058,7 +1081,9 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { +#ifdef VERBOSE if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); +#endif memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); From dacc15f33be504cf4a28da6cb23a3517e4cbb70f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 8 Dec 2021 10:27:07 +0200 Subject: [PATCH 048/135] Count pred mode bit --- src/search.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/search.c b/src/search.c index ce521e23..bf37640f 100644 --- 
a/src/search.c +++ b/src/search.c @@ -628,6 +628,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } + cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0) * state->lambda; } } @@ -654,6 +655,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double intra_cost; kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); + if(state->frame->slicetype != KVZ_SLICE_I) { + intra_cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 1) * state->lambda; + } if (intra_cost < cost) { cost = intra_cost; cur_cu->type = CU_INTRA; @@ -755,6 +759,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits; if (cur_cu->type == CU_INTRA) { + cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); + bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, 1, "pred_mode"); mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } else { From de3a76d8747fc9ddb35375ae5d25b8dee51ff27a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 8 Dec 2021 11:48:46 +0200 Subject: [PATCH 049/135] Correctly calculate bits for transform split, however updating is done incorrectly, but cannot be fixed easily unfortunately --- src/search.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/search.c b/src/search.c index bf37640f..c1947957 100644 --- a/src/search.c +++ b/src/search.c @@ -245,7 +245,7 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. 
*/ -double kvz_cu_rd_cost_luma(encoder_state_t *const state, +double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, const cu_info_t *const pred_cu, lcu_t *const lcu, @@ -265,7 +265,7 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, const uint8_t tr_depth = tr_cu->tr_depth - depth; - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; @@ -308,9 +308,9 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update) { + if (cabac->update && tr_cu->tr_depth == 0) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. 
However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in @@ -325,10 +325,12 @@ double kvz_cu_rd_cost_luma(encoder_state_t *const state, tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); } - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); - *bit_cost += tr_tree_bits; - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, is_set, "luma_cbf_search"); + } + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + *bit_cost += tr_tree_bits; + if(cabac->update) { + cabac->cur_ctx = ctx; + CABAC_BIN(cabac, is_set, "luma_cbf_search"); } } @@ -378,14 +380,20 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && !state->search_cabac.update) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth)) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); + cabac->cur_ctx = ctx; if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, u_is_set); + if(state->search_cabac.update) CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, v_is_set); + if (state->search_cabac.update) CABAC_BIN(cabac, v_is_set, "cbf_cb_search"); } *bit_cost += tr_tree_bits; } From 1fb69d5e2271d750dc8b68a02bffaab8fd300fb5 Mon Sep 17 
00:00:00 2001 From: Joose Sainio Date: Thu, 9 Dec 2021 13:19:42 +0200 Subject: [PATCH 050/135] Correct tr depth bit calculation --- src/search.c | 22 +++++++++++++--------- src/search_intra.c | 2 +- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index c1947957..cd4c67b7 100644 --- a/src/search.c +++ b/src/search.c @@ -270,7 +270,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; int max_tr_depth; - if (tr_cu->type == CU_INTRA) { + if (pred_cu->type == CU_INTRA) { max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; } else { @@ -279,9 +279,9 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && tr_depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) { - const cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); + cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); if (cabac->update) { cabac->cur_ctx = ctx; @@ -310,11 +310,13 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update && tr_cu->tr_depth == 0) { + if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in // luma, because the chroma cost is calculated right after the luma cost. 
+ // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. if (state->encoder_control->chroma_format != KVZ_CSP_400) { const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; @@ -380,7 +382,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth)) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth)) { const int tr_depth = depth - pred_cu->depth; cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); @@ -767,10 +769,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits; if (cur_cu->type == CU_INTRA) { - cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + if(state->frame->slicetype != KVZ_SLICE_I) { + cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); + bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra + state->search_cabac.cur_ctx = ctx; + CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + } mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } else { diff --git a/src/search_intra.c b/src/search_intra.c index ccf1ca91..ac72bd44 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -250,7 +250,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). 
if (depth < max_depth && depth < MAX_PU_DEPTH) { - split_cost = 3 * state->lambda; + split_cost = 0; split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); if (split_cost < nosplit_cost) { From 311fceade7f8c94009f8b7b68b9fe6da729862ab Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 Dec 2021 08:30:06 +0200 Subject: [PATCH 051/135] Force use inter --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index f8b88509..57e163f4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1968,7 +1968,6 @@ static void search_pu_inter(encoder_state_t * const state, kvz_sort_keys_by_cost(&amvp[2]); } - FILE_BITS((double)info->inter_bitcost, x, y, depth, "regular inter bitcost"); } /** @@ -2122,6 +2121,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } + FILE_BITS((double)*inter_bitcost, x, y, depth, "regular inter bitcost"); } From d8648fe1de496e49cf92baf2d5f1ab70425fb3cd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 13 Dec 2021 10:43:19 +0200 Subject: [PATCH 052/135] Preparation for accurately counting inter bits --- src/encode_coding_tree.c | 20 +++++-- src/encode_coding_tree.h | 6 +++ src/fast_coeff_cost.c | 6 +-- src/fast_coeff_cost.h | 2 +- src/global.h | 2 +- src/inter.c | 4 +- src/inter.h | 2 +- src/search.c | 14 ++--- src/search.h | 2 +- src/search_inter.c | 112 +++++++++++++++++++-------------------- src/search_inter.h | 6 +-- 11 files changed, 96 insertions(+), 80 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index aa083f5b..76f0cc7e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -308,11 +308,11 @@ static void encode_transform_coeff(encoder_state_t * const state, } } -static void 
encode_inter_prediction_unit(encoder_state_t * const state, +void kvz_encode_inter_prediction_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int x, int y, int width, int height, - int depth) + int depth, lcu_t* lcu) { // Mergeflag int16_t num_cand = 0; @@ -385,10 +385,20 @@ static void encode_inter_prediction_unit(encoder_state_t * const state, if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { int16_t mv_cand[2][2]; - kvz_inter_get_mv_cand_cua( + if (lcu) { + kvz_inter_get_mv_cand( + state, + x, y, width, height, + mv_cand, cur_cu, + lcu, ref_list_idx); + } + else { + kvz_inter_get_mv_cand_cua( state, x, y, width, height, - mv_cand, cur_cu, ref_list_idx); + mv_cand, cur_cu, ref_list_idx + ); + } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; @@ -855,7 +865,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth); + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL); } { diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 4832eeb1..a3f95b36 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -51,6 +51,12 @@ void kvz_encode_mvd(encoder_state_t * const state, int32_t mvd_hor, int32_t mvd_ver); +void kvz_encode_inter_prediction_unit(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu); + void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c index 
d769791d..1abb5114 100644 --- a/src/fast_coeff_cost.c +++ b/src/fast_coeff_cost.c @@ -40,7 +40,7 @@ static uint16_t to_q88(float f) return (uint16_t)(f * 256.0f + 0.5f); } -static uint64_t to_4xq88(const float f[4]) +static uint64_t to_4xq88(const double f[4]) { int i; uint64_t result = 0; @@ -58,9 +58,9 @@ int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_ uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp; for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) { - float curr_wts[4]; + double curr_wts[4]; - if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0, + if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0, curr_wts + 1, curr_wts + 2, curr_wts + 3) != 4) { diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h index 5ae6dc25..dee647f7 100644 --- a/src/fast_coeff_cost.h +++ b/src/fast_coeff_cost.h @@ -45,7 +45,7 @@ typedef struct { // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from // 0 to MAX_FAST_COEFF_COST_QP -static const float default_fast_coeff_cost_wts[][4] = { +static const double default_fast_coeff_cost_wts[][4] = { // Just extend it by stretching the first actual values.. 
{0.164240, 4.161530, 3.509033, 6.928047}, {0.164240, 4.161530, 3.509033, 6.928047}, diff --git a/src/global.h b/src/global.h index 2ad0830b..9a2ee989 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -// #define VERBOSE 1 +#define VERBOSE 1 /* CONFIG VARIABLES */ diff --git a/src/inter.c b/src/inter.c index 02ea1a95..d6b83090 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1228,7 +1228,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, int32_t width, int32_t height, const merge_candidates_t *merge_cand, - const cu_info_t *cur_cu, + const cu_info_t * const cur_cu, int8_t reflist, int16_t mv_cand[2][2]) { @@ -1335,7 +1335,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, int16_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t * const cur_cu, lcu_t *lcu, int8_t reflist) { diff --git a/src/inter.h b/src/inter.h index 1a46e98a..7b5c4ea7 100644 --- a/src/inter.h +++ b/src/inter.h @@ -88,7 +88,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, int16_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist); diff --git a/src/search.c b/src/search.c index cd4c67b7..553c4380 100644 --- a/src/search.c +++ b/src/search.c @@ -318,7 +318,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // However, if we have different tr_depth, the bits cannot be written in correct // order anyways so do not touch the chroma cbf here. 
if (state->encoder_control->chroma_format != KVZ_CSP_400) { - const cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); @@ -536,7 +536,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int cu_width = LCU_WIDTH >> depth; double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; - uint32_t inter_bitcost = MAX_INT; + double inter_bitcost = MAX_INT; cu_info_t *cur_cu; cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); @@ -600,7 +600,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (can_use_inter) { double mode_cost; - uint32_t mode_bitcost; + double mode_bitcost; kvz_search_cu_inter(state, x, y, depth, @@ -760,7 +760,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. - const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); @@ -835,7 +835,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. 
uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); @@ -844,7 +844,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); @@ -893,7 +893,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. 
- const cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN state->search_cabac.cur_ctx = ctx; FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); diff --git a/src/search.h b/src/search.h index 2ca47c22..b11a0ad5 100644 --- a/src/search.h +++ b/src/search.h @@ -59,7 +59,7 @@ typedef struct unit_stats_map_t { cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs - uint32_t bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + double bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays int size; //!< number of active elements in the lists } unit_stats_map_t; diff --git a/src/search_inter.c b/src/search_inter.c index 57e163f4..983ffcc8 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -199,15 +199,15 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int * \return true if best_mv was changed, false otherwise */ static bool check_mv_cost(inter_search_info_t *info, - int x, - int y, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; - uint32_t bitcost = 0; + double bitcost = 0; double cost = kvz_image_calc_sad( info->pic, info->ref, @@ -292,10 +292,10 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * best_mv to the best one. 
*/ static void select_starting_point(inter_search_info_t *info, - vector2d_t extra_mv, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); @@ -394,9 +394,9 @@ static double calc_mvd_cost(const encoder_state_t *state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost) + double* bitcost) { - uint32_t temp_bitcost = 0; + double temp_bitcost = 0; uint32_t merge_idx; int8_t merged = 0; @@ -429,9 +429,9 @@ static double calc_mvd_cost(const encoder_state_t *state, static bool early_terminate(inter_search_info_t *info, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, @@ -485,7 +485,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, vector2d_t mv, int *best_dist, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { assert(pattern_type < 4); @@ -603,7 +603,7 @@ void kvz_tz_raster_search(inter_search_info_t *info, int iSearchRange, int iRaster, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { const vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; @@ -618,10 +618,10 @@ void kvz_tz_raster_search(inter_search_info_t *info, static void tz_search(inter_search_info_t *info, - vector2d_t extra_mv, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -705,11 +705,11 @@ static void tz_search(inter_search_info_t *info, * points like 0,0 might be used, such as vectors 
from top or left. */ static void hexagon_search(inter_search_info_t *info, - vector2d_t extra_mv, - uint32_t steps, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -803,11 +803,11 @@ static void hexagon_search(inter_search_info_t *info, * points like 0,0 might be used, such as vectors from top or left. **/ static void diamond_search(inter_search_info_t *info, - vector2d_t extra_mv, - uint32_t steps, - double *best_cost, - uint32_t *best_bits, - vector2d_t *best_mv) + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -888,7 +888,7 @@ static void search_mv_full(inter_search_info_t *info, int32_t search_range, vector2d_t extra_mv, double *best_cost, - uint32_t *best_bits, + double* best_bits, vector2d_t *best_mv) { // Search around the 0-vector. @@ -968,7 +968,7 @@ static void search_mv_full(inter_search_info_t *info, */ static void search_frac(inter_search_info_t *info, double *best_cost, - uint32_t *best_bits, + double *best_bits, vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: @@ -985,8 +985,8 @@ static void search_frac(inter_search_info_t *info, vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; double cost = MAX_DOUBLE; - uint32_t bitcost = 0; - uint32_t bitcosts[4] = { 0 }; + double bitcost = 0; + double bitcosts[4] = { 0 }; unsigned best_index = 0; // Keep this as unsigned until SAD / SATD functions are updated @@ -1314,7 +1314,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, } double best_cost = MAX_DOUBLE; - uint32_t best_bits = MAX_INT; + double best_bits = MAX_INT; // Select starting point from among merge candidates. 
These should // include both mv_cand vectors and (0, 0). @@ -1338,12 +1338,12 @@ static void search_pu_inter_ref(inter_search_info_t *info, case KVZ_IME_DIA: diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); + &best_cost, &best_bits, &best_mv); break; default: hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, - &best_cost, &best_bits, &best_mv); + &best_cost, &best_bits, &best_mv); break; } } @@ -1484,7 +1484,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; cost += info->mvd_cost_func(info->state, merge_cand[i].mv[0][0], @@ -1827,7 +1827,7 @@ static void search_pu_inter(encoder_state_t * const state, list); double frac_cost = MAX_DOUBLE; - uint32_t frac_bits = MAX_INT; + double frac_bits = MAX_INT; vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; search_frac(info, &frac_cost, &frac_bits, &frac_mv); @@ -1917,7 +1917,7 @@ static void search_pu_inter(encoder_state_t * const state, best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; best_bipred_cost += info->mvd_cost_func(info->state, bipred_pu->inter.mv[0][0], @@ -1990,10 +1990,10 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_bitcost Return inter bitcost */ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost){ + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost){ cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); int tr_depth = MAX(1, depth); @@ -2040,7 +2040,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, 
lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2108,10 +2108,10 @@ void kvz_search_cu_inter(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + x, y, depth, + lcu, + inter_cost, + inter_bitcost); } if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { @@ -2146,7 +2146,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2173,7 +2173,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, const int height_pu = PU_GET_H(part_mode, width, i); double cost = MAX_DOUBLE; - uint32_t bitcost = MAX_INT; + double bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); @@ -2250,10 +2250,10 @@ void kvz_search_cu_smp(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + x, y, depth, + lcu, + inter_cost, + inter_bitcost); } else { *inter_cost += state->lambda_sqrt * smp_extra_bits; } diff --git a/src/search_inter.h b/src/search_inter.h index 8b4b16f2..bb9067c5 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -71,13 +71,13 @@ typedef double kvz_mvd_cost_func(const encoder_state_t *state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost); + double *bitcost); void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); void kvz_search_cu_smp(encoder_state_t * const state, int x, int y, @@ -85,7 +85,7 @@ 
void kvz_search_cu_smp(encoder_state_t * const state, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); unsigned kvz_inter_satd_cost(const encoder_state_t* state, From 4b8d217f2dcc7dfba3e3abd17ca6c95013437d5b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 13 Dec 2021 12:23:16 +0200 Subject: [PATCH 053/135] Add new macro for potentially updating cabac context when obtaining the bit cost --- src/cabac.h | 8 +++++ src/sao.c | 50 +++++++++++++++-------------- src/search.c | 78 ++++++++++++++-------------------------------- src/search_intra.c | 27 +++++++--------- 4 files changed, 70 insertions(+), 93 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index 59fb448c..8f0c7c70 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -131,6 +131,14 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol extern const float kvz_f_entropy_bits[128]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] + +#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ + (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ + if((cabac)->update) {\ + (cabac)->cur_ctx = ctx;\ + CABAC_BIN((cabac), (val), (name));\ + } \ +} while(0) extern double bits_written; // Macros diff --git a/src/sao.c b/src/sao.c index 35be7176..179f4311 100644 --- a/src/sao.c +++ b/src/sao.c @@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) { } -static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) +static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, none = 0 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type"); return mode_bits; } -static float sao_mode_bits_merge(const encoder_state_t * const state, +static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag"); if (merge_cand == 1) return mode_bits; - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag"); return mode_bits; } -static float sao_mode_bits_edge(const encoder_state_t * const state, +static double sao_mode_bits_edge(const encoder_state_t * const state, int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { - ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + ctx = &(cabac->ctx.sao_merge_flag_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, edge = 2 = cMax ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets. for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) { @@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, } -static float sao_mode_bits_band(const encoder_state_t * const state, +static double sao_mode_bits_band(const encoder_state_t * const state, int band_position[2], int offsets[10], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->search_cabac; + double mode_bits = 0.0; + cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded sao_type_idx_, band = 1 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets and possible FL coded offset signs. 
for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) diff --git a/src/search.c b/src/search.c index 553c4380..43a07d4b 100644 --- a/src/search.c +++ b/src/search.c @@ -282,11 +282,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); - if (cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, tr_depth > 0, "tr_split_search"); - } + CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); *bit_cost += tr_tree_bits; } @@ -318,23 +314,16 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, // However, if we have different tr_depth, the bits cannot be written in correct // order anyways so do not touch the chroma cbf here. if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cabac_ctx_t* cr_ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); cabac->cur_ctx = cr_ctx; int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, u_is_set); - CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); - tr_tree_bits += CTX_ENTROPY_FBITS(cr_ctx, v_is_set); - CABAC_BIN(cabac, v_is_set, "cbf_cr_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } } - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, is_set); + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); *bit_cost += tr_tree_bits; - if(cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, is_set, "luma_cbf_search"); - } - } // SSD between reconstruction and original @@ -389,13 +378,11 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, cabac->cur_ctx = ctx; 
if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, u_is_set); - if(state->search_cabac.update) CABAC_BIN(cabac, u_is_set, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, v_is_set); - if (state->search_cabac.update) CABAC_BIN(cabac, v_is_set, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } *bit_cost += tr_tree_bits; } @@ -638,7 +625,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } - cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0) * state->lambda; + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 0, pred_mode_type_bits, "pred_mode_flag"); + cost += pred_mode_type_bits * state->lambda; } } @@ -666,7 +655,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); if(state->frame->slicetype != KVZ_SLICE_I) { - intra_cost += CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 1) * state->lambda; + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + intra_cost += pred_mode_type_bits * state->lambda; } if (intra_cost < cost) { cost = intra_cost; @@ -754,26 +745,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if(depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t* ctx = 
&(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } double mode_bits; if (cur_cu->type == CU_INTRA) { if(state->frame->slicetype != KVZ_SLICE_I) { cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - bits += CTX_ENTROPY_FBITS(ctx, 1); // Intra - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, 1, "pred_mode"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, bits, "pred_mode_flag"); } mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); } @@ -832,25 +816,22 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); state->search_cabac.update = 1; + double split_bits = 0; + if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. 
cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); - split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 0), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 0, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting @@ -887,17 +868,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - state->search_cabac.cur_ctx = ctx; - bits += CTX_ENTROPY_FBITS(ctx, 0); - CABAC_BIN(&state->search_cabac, 0, "no_split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); } else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. 
cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - bits += CTX_ENTROPY_FBITS(ctx, 1); // NxN - state->search_cabac.cur_ctx = ctx; - FILE_BITS(CTX_ENTROPY_FBITS(ctx, 1), x, y, depth, "split"); - CABAC_BIN(&state->search_cabac, 1, "split_search"); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search"); } cur_cu->intra = cu_d1->intra; @@ -915,7 +891,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode, mode_chroma, NULL, lcu); - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y) + bits; cost += mode_bits * state->lambda; cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); @@ -924,12 +900,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } FILE_BITS(bits, x, y, depth, "merged intra bits"); - // Add the cost of coding no-split. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - - // Add the cost of coding intra mode only once. memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); diff --git a/src/search_intra.c b/src/search_intra.c index ac72bd44..e29f29a3 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -270,7 +270,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); - tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); *bit_cost += tr_split_bit; } @@ -285,10 +285,10 @@ static double search_intra_trdepth(encoder_state_t * const state, const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } *bit_cost += cbf_bits; } @@ -650,7 +650,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { cabac_data_t* cabac = &state->search_cabac; - double mode_bits; + double mode_bits = 0; bool mode_in_preds = false; for (int i = 0; i < 3; ++i) { @@ -660,10 +660,8 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t } const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); - mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds); + CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); if (state->search_cabac.update) { - state->search_cabac.cur_ctx = ctx; - CABAC_BIN(&state->search_cabac, mode_in_preds, "prev_intra_luma_pred_flag_search"); if(mode_in_preds) { CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); if(luma_mode != intra_preds[0]) { @@ -689,17 +687,16 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const 
int8_t double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); - double mode_bits; - if (chroma_mode == luma_mode) { - mode_bits = CTX_ENTROPY_FBITS(ctx, 0); - } else { - mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1); + + double mode_bits = 0; + CABAC_FBITS_UPDATE(cabac, ctx, chroma_mode != luma_mode, mode_bits, "intra_chroma_pred_mode"); + if (chroma_mode != luma_mode) { + mode_bits += 2.0; } + if(cabac->update) { - cabac->cur_ctx = ctx; - CABAC_BIN(cabac, chroma_mode != luma_mode, "intra_chroma_pred_mode"); if(chroma_mode != luma_mode) { // Again it does not matter what we actually write here CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); From aea1133e6a48715be04d738e500aaccd6d0b871e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 08:40:23 +0200 Subject: [PATCH 054/135] Function for mock coding a CU and counting the bits --- src/cabac.c | 3 +- src/cabac.h | 4 +- src/encode_coding_tree.c | 228 +++++++++++++++++++++++++++++++-------- src/encode_coding_tree.h | 19 +++- src/rdo.c | 20 ++-- src/rdo.h | 8 +- src/search.c | 56 +++++++--- src/search_inter.c | 8 +- 8 files changed, 261 insertions(+), 85 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 5842edbe..ed480e17 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -547,7 +547,7 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int /** * \brief */ -void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, +uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count) @@ -576,4 +576,5 @@ void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, } } kvz_cabac_encode_bins_ep(data, bins, num_bins); + return num_bins; } diff --git a/src/cabac.h b/src/cabac.h index 
8f0c7c70..62d59d9e 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -122,7 +122,7 @@ void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param); void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol, const uint32_t r_param, int32_t base_level); -void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, +uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t symbol, int32_t offset, @@ -133,7 +133,7 @@ extern const float kvz_f_entropy_bits[128]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] #define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ - (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ + if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(ctx)->uc_state ^ (val)]; \ if((cabac)->update) {\ (cabac)->cur_ctx = ctx;\ CABAC_BIN((cabac), (val), (name));\ diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 76f0cc7e..a847640e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -309,15 +309,17 @@ static void encode_transform_coeff(encoder_state_t * const state, } void kvz_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu) + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu, double* bits_out) { // Mergeflag int16_t num_cand = 0; - cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); + double bits = 0; + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag"); + num_cand = 
state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { @@ -325,10 +327,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + if(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -339,12 +341,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, uint8_t inter_dir = cur_cu->inter.mv_dir-1; if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[depth]); - CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[depth]), inter_dir == 2, bits, "inter_pred_idc"); } if (inter_dir < 2) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); - CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[4]), inter_dir, bits, "inter_pred_idc"); } } @@ -359,9 +359,8 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, if (ref_LX_size > 1) { // parseRefFrmIdx int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { ref_frame--; @@ -373,9 +372,10 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, if (i == 0) { cabac->cur_ctx = &cabac->ctx.cu_ref_pic_model[1]; - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_ref_pic_model[1], symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + if 
(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -404,7 +404,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver); + kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out); } // Signal which candidate MV to use @@ -416,6 +416,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, } // for ref_list } // if !merge + if(bits_out) *bits_out += bits; } @@ -466,7 +467,7 @@ static INLINE uint8_t intra_mode_encryption(encoder_state_t * const state, static void encode_intra_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth) + int x, int y, int depth, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual[4]; @@ -569,18 +570,19 @@ static void encode_intra_coding_unit(encoder_state_t * const state, } #endif } - - cabac->cur_ctx = &(cabac->ctx.intra_mode_model); + for (int j = 0; j < num_pred_units; ++j) { - CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_mode_model),flag[j], *bits_out, "prev_intra_luma_pred_flag"); } for (int j = 0; j < num_pred_units; ++j) { // Signal index of the prediction mode in the prediction list. if (flag[j]) { CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx"); + if (cabac->only_count) *bits_out += 1; if (mpm_preds[j] != 0) { CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx"); + if (cabac->only_count) *bits_out += 1; } } else { // Signal the actual prediction mode. 
@@ -599,6 +601,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state, } CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode"); + if (cabac->only_count) *bits_out += 5; } } @@ -639,17 +642,21 @@ static void encode_intra_coding_unit(encoder_state_t * const state, */ cabac->cur_ctx = &(cabac->ctx.chroma_pred_model[0]); if (pred_mode == 4) { - CABAC_BIN(cabac, 0, "intra_chroma_pred_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.chroma_pred_model[0]), 0, *bits_out,"intra_chroma_pred_mode"); } else { - CABAC_BIN(cabac, 1, "intra_chroma_pred_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.chroma_pred_model[0]), 1, *bits_out,"intra_chroma_pred_mode"); CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode"); + if (cabac->only_count) *bits_out += 2; } } - encode_transform_coeff(state, x, y, depth, 0, 0, 0); + // if we are counting bits, the cost for transform coeffs is done separately + // To get the distortion at the same time + if(!cabac->only_count) + encode_transform_coeff(state, x, y, depth, 0, 0, 0); } -static void encode_part_mode(encoder_state_t * const state, +static double encode_part_mode(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int depth) @@ -684,32 +691,32 @@ static void encode_part_mode(encoder_state_t * const state, // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass // ------------------------------+------------------ - + double bits = 0; if (cur_cu->type == CU_INTRA) { if (depth == MAX_DEPTH) { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); } else { - CABAC_BIN(cabac, 0, "part_mode NxN"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN"); } } } else { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == 
SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode split"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split"); cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_2NxnD) { - CABAC_BIN(cabac, 1, "part_mode vertical"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical"); } else { - CABAC_BIN(cabac, 0, "part_mode horizontal"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal"); } if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) { @@ -717,19 +724,22 @@ static void encode_part_mode(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_Nx2N) { - CABAC_BIN(cabac, 1, "part_mode SMP"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode AMP"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP"); if (cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_nLx2N) { CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } else { CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } } } + return bits; } void kvz_encode_coding_tree(encoder_state_t * const state, @@ -865,7 +875,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL); + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, 
NULL); } { @@ -883,7 +893,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); } #if ENABLE_PCM @@ -942,11 +952,135 @@ end: } +void kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu) { + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + + const int cu_width = LCU_WIDTH >> depth; + const int half_cu = cu_width >> 1; + + const cu_info_t* cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const cu_info_t* left_cu = NULL, *above_cu = NULL; + if (x) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); + } + uint8_t split_model = 0; + + // Absolute coordinates + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y; + + // Check for slice border + bool border_x = ctrl->in.width < abs_x + cu_width; + bool border_y = ctrl->in.height < abs_y + cu_width; + bool border = border_x || border_y; /*!< are we in any border CU */ + + if (depth <= state->frame->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + + // When not in MAX_DEPTH, insert split flag and split the blocks if needed + if (depth != MAX_DEPTH) { + // Implicit split flag when on border + if (!border) { + // Get left and top block split_flags and if they are present and true, increase model number + if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + split_model++; + } + + if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + split_model++; + } + + // This mocks encoding the current CU so it should be never split + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag"); + } + } + + // Encode skip flag + 
if (state->frame->slicetype != KVZ_SLICE_I) { + int8_t ctx_skip = 0; + + if (left_cu && left_cu->skipped) { + ctx_skip++; + } + if (above_cu && above_cu->skipped) { + ctx_skip++; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag"); + + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + if(cabac->only_count) bits += 1; + } + if (symbol == 0) { + break; + } + } + } + return; + } + } + // Prediction mode + if (state->frame->slicetype != KVZ_SLICE_I) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model), (cur_cu->type == CU_INTRA), bits, "PredMode"); + } + + // part_mode + bits += encode_part_mode(state, cabac, cur_cu, depth); + + if (cur_cu->type == CU_INTER) { + const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; + + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); + const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); + const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); + const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); + const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); + } + + { + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + // Only need to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, bits, "rqt_root_cbf"); + } + + } + } + else if (cur_cu->type == CU_INTRA) { + 
encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + } +} + void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver) + int32_t mvd_ver, double* bits_out) { const int8_t hor_abs_gr0 = mvd_hor != 0; const int8_t ver_abs_gr0 = mvd_ver != 0; @@ -954,20 +1088,21 @@ void kvz_encode_mvd(encoder_state_t * const state, const uint32_t mvd_ver_abs = abs(mvd_ver); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver"); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor"); } if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver"); } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + if(cabac->only_count) *bits_out += bits; } uint32_t mvd_hor_sign = (mvd_hor > 0) ? 
0 : 1; if (!state->cabac.only_count && @@ -976,10 +1111,12 @@ void kvz_encode_mvd(encoder_state_t * const state, mvd_hor_sign = mvd_hor_sign ^ kvz_crypto_get_key(state->crypto_hdl, 1); } CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + if (cabac->only_count) *bits_out += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + if (cabac->only_count) *bits_out += bits; } uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1; if (!state->cabac.only_count && @@ -988,5 +1125,6 @@ void kvz_encode_mvd(encoder_state_t * const state, mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); } CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + if (cabac->only_count) *bits_out += 1; } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index a3f95b36..b8e29358 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -49,13 +49,22 @@ void kvz_encode_coding_tree(encoder_state_t * const state, void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver); + int32_t mvd_ver, + double* bits_out); + +void kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu); void kvz_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu); + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, + lcu_t* lcu, + double* bits_out); void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, diff --git a/src/rdo.c b/src/rdo.c index 6b8960ee..5b6c3b49 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1010,22 +1010,18 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, 
coeff_t *dest_coeff, /** * Calculate cost of actual motion vectors using CABAC coding */ -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { cabac_data_t cabac_copy = *cabac; cabac_copy.only_count = 1; - + double bits = 0; // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits); - uint32_t bitcost = - ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - - ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); - - return bitcost; + return bits; } /** MVD cost calculation with CABAC @@ -1160,7 +1156,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, NULL); } // Signal which candidate MV to use diff --git a/src/rdo.h b/src/rdo.h index dd75fdb9..23453eee 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -71,10 +71,10 @@ uint32_t kvz_get_coded_level(encoder_state_t * state, double* coded_cost, double kvz_mvd_cost_func kvz_calc_mvd_cost_cabac; -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - int32_t mvd_hor, - int32_t mvd_ver); +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + int32_t mvd_hor, + int32_t mvd_ver); // Number of fixed point fractional bits used in the fractional bit table. 
#define CTX_FRAC_BITS 15 diff --git a/src/search.c b/src/search.c index 43a07d4b..ad24b501 100644 --- a/src/search.c +++ b/src/search.c @@ -740,29 +740,61 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; - state->search_cabac.update = 1; + cabac_data_t* cabac = &state->search_cabac; + cabac->update = 1; if(depth < MAX_DEPTH) { uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); + cabac_ctx_t* ctx = &(cabac->ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); } else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { // Add cost of intra part_size. - cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); + cabac_ctx_t* ctx = &(cabac->ctx.part_size_model[0]); + CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); } - double mode_bits; + double mode_bits = 0; + if (state->frame->slicetype != KVZ_SLICE_I) { + int ctx_skip = 0; + if (x > 0) { + ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; + } + if (y > 0) { + ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; + } + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, mode_bits, "skip_flag"); + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, mode_bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + mode_bits += 1; + } + if (symbol == 0) { + break; + } + } + } + } + + } if 
(cur_cu->type == CU_INTRA) { if(state->frame->slicetype != KVZ_SLICE_I) { - cabac_ctx_t* ctx = &(state->search_cabac.ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, bits, "pred_mode_flag"); + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "pred_mode_flag"); } - mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y); + mode_bits += calc_mode_bits(state, lcu, cur_cu, x, y); } - else { - mode_bits = inter_bitcost; + else if (!cur_cu->skipped) { + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "pred_mode_flag"); + mode_bits += inter_bitcost; } bits += mode_bits; cost = mode_bits * state->lambda; @@ -795,7 +827,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - state->search_cabac.update = 0; + cabac->update = 0; } bool can_split_cu = diff --git a/src/search_inter.c b/src/search_inter.c index 983ffcc8..a4c75d9e 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -323,19 +323,19 @@ static void select_starting_point(inter_search_info_t *info, } -static uint32_t get_mvd_coding_cost(const encoder_state_t *state, +static double get_mvd_coding_cost(const encoder_state_t *state, const cabac_data_t* cabac, const int32_t mvd_hor, const int32_t mvd_ver) { - unsigned bitcost = 0; + double bitcost = 0; const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; // Round and shift back to integer bits. 
- return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + return bitcost / (1 << CTX_FRAC_BITS); } @@ -353,7 +353,7 @@ static int select_mv_cand(const encoder_state_t *state, return 0; } - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + double (*mvd_coding_cost)(const encoder_state_t * const state, const cabac_data_t*, int32_t, int32_t); if (state->encoder_control->cfg.mv_rdo) { From 64b2806cc818f029b6387b5e868b79b06fb1cd6e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 11:26:45 +0200 Subject: [PATCH 055/135] Add couple of missing bits to the calculation and get intra neighbours from lcu rather than cu_array --- src/cabac.c | 18 ++++++----- src/cabac.h | 4 +-- src/encode_coding_tree.c | 27 ++++++++--------- src/encode_coding_tree.h | 4 +-- src/rdo.c | 15 +++++----- src/search.c | 65 ++++++++-------------------------------- 6 files changed, 47 insertions(+), 86 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index ed480e17..36931277 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -491,26 +491,28 @@ void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, ca /** * \brief */ -void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol) +void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, + cabac_ctx_t * const ctx, + uint32_t symbol, + const int32_t offset, + const uint32_t max_symbol, + double* bits_out) { int8_t code_last = max_symbol > symbol; assert(symbol <= max_symbol); if (!max_symbol) return; - - data->cur_ctx = &ctx[0]; - CABAC_BIN(data, symbol, "ums"); + + CABAC_FBITS_UPDATE(data, &ctx[0], symbol, *bits_out, "ums"); if (!symbol) return; while (--symbol) { - data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 1, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums"); } if (code_last) { - data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 0, "ums"); + CABAC_FBITS_UPDATE(data, 
&ctx[offset], 0,*bits_out, "ums"); } } diff --git a/src/cabac.h b/src/cabac.h index 62d59d9e..f9190045 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -125,8 +125,8 @@ void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, ca uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, - uint32_t symbol, int32_t offset, - uint32_t max_symbol); + uint32_t symbol, int32_t offset, + uint32_t max_symbol, double* bits_out); void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); extern const float kvz_f_entropy_bits[128]; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a847640e..b25494f4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -290,7 +290,7 @@ static void encode_transform_coeff(encoder_state_t * const state, // cu_qp_delta_abs prefix cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0]; - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5); + kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL); if (qp_delta_abs >= 5) { // cu_qp_delta_abs suffix @@ -412,7 +412,7 @@ void kvz_encode_inter_prediction_unit(encoder_state_t * const state, cabac->ctx.mvp_idx_model, CU_GET_MV_CAND(cur_cu, ref_list_idx), 1, - AMVP_MAX_NUM_CANDS - 1); + AMVP_MAX_NUM_CANDS - 1, bits_out); } // for ref_list } // if !merge @@ -467,7 +467,7 @@ static INLINE uint8_t intra_mode_encryption(encoder_state_t * const state, static void encode_intra_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, double* bits_out) + int x, int y, int depth, lcu_t* lcu, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual[4]; @@ -506,19 
+506,19 @@ static void encode_intra_coding_unit(encoder_state_t * const state, for (int j = 0; j < num_pred_units; ++j) { const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); + const cu_info_t *cur_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)) : kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); const cu_info_t *left_pu = NULL; const cu_info_t *above_pu = NULL; if (pu_x > 0) { assert(pu_x >> 2 > 0); - left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y); + left_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x -1), SUB_SCU(pu_y)) : kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y); } // Don't take the above PU across the LCU boundary. if (pu_y % LCU_WIDTH > 0 && pu_y > 0) { assert(pu_y >> 2 > 0); - above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1); + above_pu = lcu ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y - 1)) : kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1); } if (do_crypto) { @@ -893,7 +893,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); } #if ENABLE_PCM @@ -952,11 +952,11 @@ end: } -void kvz_mock_encode_coding_unit( +double kvz_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, int x, int y, int depth, - lcu_t* lcu) { + lcu_t* lcu, cu_info_t* cur_cu) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; @@ -964,9 +964,7 @@ void kvz_mock_encode_coding_unit( int y_local = SUB_SCU(y); const int cu_width = LCU_WIDTH >> depth; - const int half_cu = cu_width >> 1; - - const cu_info_t* cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { left_cu = 
LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); @@ -1037,7 +1035,7 @@ void kvz_mock_encode_coding_unit( } } } - return; + return bits; } } // Prediction mode @@ -1072,8 +1070,9 @@ void kvz_mock_encode_coding_unit( } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); } + return bits; } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index b8e29358..42a1a981 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -52,11 +52,11 @@ void kvz_encode_mvd(encoder_state_t * const state, int32_t mvd_ver, double* bits_out); -void kvz_mock_encode_coding_unit( +double kvz_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, int x, int y, int depth, - lcu_t* lcu); + lcu_t* lcu, cu_info_t* cur_cu); void kvz_encode_inter_prediction_unit(encoder_state_t* const state, cabac_data_t* const cabac, diff --git a/src/rdo.c b/src/rdo.c index 5b6c3b49..fc0b2198 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1081,8 +1081,8 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, x - mv_cand[1][0], y - mv_cand[1][1], }; - uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); - uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); + double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + double cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -1161,11 +1161,12 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // Signal which candidate MV to use kvz_cabac_write_unary_max_symbol( - cabac, - cabac->ctx.mvp_idx_model, - cur_mv_cand, - 1, - AMVP_MAX_NUM_CANDS - 1); + cabac, + cabac->ctx.mvp_idx_model, + cur_mv_cand, + 1, + AMVP_MAX_NUM_CANDS - 1, + NULL); } } } diff --git a/src/search.c b/src/search.c index 
ad24b501..1fc36566 100644 --- a/src/search.c +++ b/src/search.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "imagelist.h" #include "inter.h" #include "intra.h" @@ -743,61 +744,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; - if(depth < MAX_DEPTH) { - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); + if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { + bits += kvz_mock_encode_coding_unit( + state, + cabac, + x, y, depth, + lcu, + cur_cu); } - else if(depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { - // Add cost of intra part_size. - cabac_ctx_t* ctx = &(cabac->ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(cabac, ctx, 0, bits, "no_split_search"); + else { + // Intra 4×4 PUs } - - double mode_bits = 0; - if (state->frame->slicetype != KVZ_SLICE_I) { - int ctx_skip = 0; - if (x > 0) { - ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; - } - if (y > 0) { - ctx_skip += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, mode_bits, "skip_flag"); - if (cur_cu->skipped) { - int16_t num_cand = state->encoder_control->cfg.max_merge; - if (num_cand > 1) { - for (int ui = 0; ui < num_cand - 1; ui++) { - int32_t symbol = (ui != cur_cu->merge_idx); - if (ui == 0) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, mode_bits, "MergeIndex"); - } - else { - CABAC_BIN_EP(cabac, symbol, "MergeIndex"); - mode_bits += 1; - } - if (symbol == 0) { - break; - } - } - } - } - - } - if (cur_cu->type == CU_INTRA) { - if(state->frame->slicetype != KVZ_SLICE_I) { - cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(cabac, 
ctx, 1, mode_bits, "pred_mode_flag"); - } - mode_bits += calc_mode_bits(state, lcu, cur_cu, x, y); - } - else if (!cur_cu->skipped) { - cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); - CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "pred_mode_flag"); - mode_bits += inter_bitcost; - } - bits += mode_bits; - cost = mode_bits * state->lambda; + + cost = bits * state->lambda; cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); if (state->encoder_control->chroma_format != KVZ_CSP_400) { From 951a845f086232c5114b11244c183065edeab87d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 11:48:59 +0200 Subject: [PATCH 056/135] Remove consts and fix wrong types --- src/rdo.c | 5 ++--- src/sao.c | 14 +++++++------- src/search_intra.c | 12 ++++++------ 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index fc0b2198..04b9aca9 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -271,8 +271,7 @@ static INLINE uint32_t get_coeff_cabac_cost( scan_mode, 0); if(cabac_copy.update) { - - memcpy(&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); } return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } @@ -1036,7 +1035,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost) + double* bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; diff --git a/src/sao.c b/src/sao.c index 179f4311..b7d76e64 100644 --- a/src/sao.c +++ b/src/sao.c @@ -53,7 +53,7 @@ static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t { double mode_bits = 0.0; cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); @@ -74,8 +74,8 @@ static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); @@ -91,8 +91,8 @@ static double sao_mode_bits_edge(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); @@ -132,8 +132,8 @@ static double sao_mode_bits_band(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { double mode_bits = 0.0; - cabac_data_t * const cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = NULL; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); diff --git a/src/search_intra.c b/src/search_intra.c index e29f29a3..2986f67f 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -269,7 +269,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // Add bits for split_transform_flag = 1, because transform depth search bypasses // the normal recursion in the cost functions. 
if (depth >= 1 && depth <= 3) { - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); *bit_cost += tr_split_bit; } @@ -283,7 +283,7 @@ static double search_intra_trdepth(encoder_state_t * const state, if (state->encoder_control->chroma_format != KVZ_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); + cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } @@ -647,9 +647,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } -double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) +double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds) { - cabac_data_t* cabac = &state->search_cabac; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; bool mode_in_preds = false; @@ -659,7 +659,7 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t } } - const cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); + cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model); CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); if (state->search_cabac.update) { if(mode_in_preds) { @@ -688,7 +688,7 @@ double kvz_luma_mode_bits(encoder_state_t *state, int8_t luma_mode, const int8_t double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]); + cabac_ctx_t *ctx 
= &(cabac->ctx.chroma_pred_model[0]); double mode_bits = 0; CABAC_FBITS_UPDATE(cabac, ctx, chroma_mode != luma_mode, mode_bits, "intra_chroma_pred_mode"); From a038ccc19ae74b4287204b6d9e95a3847a8e0b42 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 16 Dec 2021 13:16:48 +0200 Subject: [PATCH 057/135] =?UTF-8?q?add=20back=20bitcost=20for=204=C3=974?= =?UTF-8?q?=20intra=20PUs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/search.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/search.c b/src/search.c index 1fc36566..def91fcb 100644 --- a/src/search.c +++ b/src/search.c @@ -754,6 +754,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else { // Intra 4×4 PUs + if (state->frame->slicetype != KVZ_SLICE_I) { + cabac_ctx_t* ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_FBITS_UPDATE(cabac, ctx, 1, bits, "pred_mode_flag"); + } + bits += calc_mode_bits(state, lcu, cur_cu, x, y); } cost = bits * state->lambda; From 243e45f07e9696f9ee5515be385d1af668bcf9d3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 20 Dec 2021 09:36:23 +0200 Subject: [PATCH 058/135] accurate bit cost calculation when using transform skip --- src/cabac.c | 6 +- src/cabac.h | 1 - src/encoderstate.c | 2 + src/search.c | 171 +++++++++++++++++++++++++++++++++++++++------ 4 files changed, 153 insertions(+), 27 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index 36931277..ae31fb0b 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -106,8 +106,8 @@ void kvz_cabac_start(cabac_data_t * const data) void kvz_cabac_encode_bin(cabac_data_t * const data, const uint32_t bin_value) { uint32_t lps; - - + + if (!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (bin_value)); lps = kvz_g_auc_lpst_table[CTX_STATE(data->cur_ctx)][(data->range >> 6) & 3]; data->range -= lps; @@ -577,6 +577,6 @@ uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, bins = ( (bins >> (num_bins 
>>1) ) << (num_bins >>1) ) | state->crypto_prev_pos; } } - kvz_cabac_encode_bins_ep(data, bins, num_bins); + CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb"); return num_bins; } diff --git a/src/cabac.h b/src/cabac.h index f9190045..6c46011b 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -156,7 +156,6 @@ extern double bits_written; #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ - if(!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (value));\ kvz_cabac_encode_bin((data), (value)); \ if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } diff --git a/src/encoderstate.c b/src/encoderstate.c index 012476df..d02ca483 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -685,6 +685,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) const uint64_t existing_bits = kvz_bitstream_tell(&state->stream); //Encode SAO + state->cabac.update = 1; if (encoder->cfg.sao_type) { encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]); } @@ -737,6 +738,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_crypto_delete(&state->crypto_hdl); } } + state->cabac.update = 0; pthread_mutex_lock(&state->frame->rc_lock); const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits; diff --git a/src/search.c b/src/search.c index def91fcb..461eae4e 100644 --- a/src/search.c +++ b/src/search.c @@ -299,30 +299,34 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, return sum + tr_tree_bits * state->lambda; } + + if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac 
state changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. + // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } + } + // Add transform_tree cbf_luma bit cost. + const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; if (pred_cu->type == CU_INTRA || - tr_depth > 0 || + is_tr_split || cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { - // Because these need to be coded before the luma cbf they also need to be counted - // before the cabac state changes. However, since this branch is only executed when - // calculating the last RD cost it is not problem to include the chroma cbf costs in - // luma, because the chroma cost is calculated right after the luma cost. - // However, if we have different tr_depth, the bits cannot be written in correct - // order anyways so do not touch the chroma cbf here. 
- if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); - cabac->cur_ctx = cr_ctx; - int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); - CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } - } + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); *bit_cost += tr_tree_bits; } @@ -390,7 +394,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); - int sum = 0; + double sum = 0; sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); @@ -426,6 +430,126 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return (double)ssd * CHROMA_MULT + bits * state->lambda; } +static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu, + double* bit_cost) { + const int width = LCU_WIDTH >> depth; + + // cur_cu is used for TU parameters. + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + + double coeff_bits = 0; + double tr_tree_bits = 0; + + // Check that lcu is not in + assert(x_px >= 0 && x_px < LCU_WIDTH); + assert(y_px >= 0 && y_px < LCU_WIDTH); + + const uint8_t tr_depth = tr_cu->tr_depth - depth; + + const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + + // Add transform_tree split_transform_flag bit cost. 
+ bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + int max_tr_depth; + if (pred_cu->type == CU_INTRA) { + max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag; + } + else { + max_tr_depth = state->encoder_control->tr_depth_inter; + } + if (width <= TR_MAX_WIDTH + && width > TR_MIN_WIDTH + && !intra_split_flag + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + { + cabac_ctx_t* ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); + CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); + } + + if(state->encoder_control->chroma_format != KVZ_CSP_400) { + if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_U)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_u, tr_tree_bits, "cbf_cb"); + } + if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_V)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_v, tr_tree_bits, "cbf_cr"); + } + } + + if (tr_depth > 0) { + int offset = LCU_WIDTH >> (depth + 1); + double sum = 0; + *bit_cost += tr_tree_bits; + + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + return sum + tr_tree_bits * state->lambda; + } + const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; + + // Add transform_tree cbf_luma bit cost. 
+ const int is_tr_split = depth - tr_cu->depth; + if (pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); + + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + } + *bit_cost += tr_tree_bits; + // SSD between reconstruction and original + unsigned luma_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + int index = y_px * LCU_WIDTH + x_px; + luma_ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width); + } + + { + int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + + coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); + } + + unsigned chroma_ssd = 0; + if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) { + const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; + const int chroma_width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + chroma_ssd = ssd_u + ssd_v; + } + + { + int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order); + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order); + } + } + *bit_cost += coeff_bits; + double bits = tr_tree_bits + coeff_bits; + return luma_ssd * LUMA_MULT + chroma_ssd * CHROMA_MULT + bits * state->lambda; +} + // Return estimate of bits used to code prediction mode of cur_cu. 
static double calc_mode_bits(const encoder_state_t *state, @@ -763,10 +887,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = bits * state->lambda; - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - } + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, &bits); + //if (state->encoder_control->chroma_format != KVZ_CSP_400) { + // cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); + //} FILE_BITS(bits, x, y, depth, "final rd bits"); @@ -826,6 +950,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } + FILE_BITS(split_bits, x, y, depth, "split"); state->search_cabac.update = 0; split_cost += split_bits * state->lambda; From f83e21735ce602f2672cc4fa51a2aaf8b8294e92 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 20 Dec 2021 10:44:19 +0200 Subject: [PATCH 059/135] Fix couple of mistakes --- src/search.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index 461eae4e..2cf9dae6 100644 --- a/src/search.c +++ b/src/search.c @@ -437,6 +437,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, double* bit_cost) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -466,17 +467,18 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth + && !skip_residual_coding) { cabac_ctx_t* ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); } - if(state->encoder_control->chroma_format != KVZ_CSP_400) { - if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_U)) { + if(state->encoder_control->chroma_format != KVZ_CSP_400 && !skip_residual_coding) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_u, tr_tree_bits, "cbf_cb"); } - if(tr_cu->depth == depth || cbf_is_set(tr_cu->cbf, depth - 1, COLOR_V)) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]), cb_flag_v, tr_tree_bits, "cbf_cr"); } } @@ -496,10 +498,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, // Add transform_tree cbf_luma bit cost. 
const int is_tr_split = depth - tr_cu->depth; - if (pred_cu->type == CU_INTRA || + if ((pred_cu->type == CU_INTRA || is_tr_split || cb_flag_u || - cb_flag_v) + cb_flag_v) + && !skip_residual_coding) { cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); From 5ed1ffb5d4b1e4036ebc45736c4195edfcd53711 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 21 Dec 2021 17:04:47 +0200 Subject: [PATCH 060/135] WIP: pre Christmas --- src/encode_coding_tree.c | 6 +++--- src/encode_coding_tree.h | 5 +++++ src/rdo.c | 28 +++++++++++++--------------- src/search.c | 2 +- src/search_inter.c | 26 +++++++++++++------------- 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index b25494f4..ffd8ae1e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -656,7 +656,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state, encode_transform_coeff(state, x, y, depth, 0, 0, 0); } -static double encode_part_mode(encoder_state_t * const state, +double kvz_encode_part_mode(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int depth) @@ -863,7 +863,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } // part_mode - encode_part_mode(state, cabac, cur_cu, depth); + kvz_encode_part_mode(state, cabac, cur_cu, depth); if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; @@ -1044,7 +1044,7 @@ double kvz_mock_encode_coding_unit( } // part_mode - bits += encode_part_mode(state, cabac, cur_cu, depth); + bits += kvz_encode_part_mode(state, cabac, cur_cu, depth); if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 42a1a981..d189e6e0 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -58,6 +58,11 @@ double kvz_mock_encode_coding_unit( int x, int y, int depth, lcu_t* 
lcu, cu_info_t* cur_cu); +double kvz_encode_part_mode(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int depth); + void kvz_encode_inter_prediction_unit(encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, diff --git a/src/rdo.c b/src/rdo.c index 04b9aca9..e8805494 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1062,14 +1062,13 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, } // Store cabac state and contexts - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); + memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t)); // Clear bytes and bits and set mode to "count" state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; cabac = &state_cabac_copy; + double bits = 0; if (!merged) { vector2d_t mvd1 = { @@ -1094,7 +1093,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, merged, "MergeFlag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag"); num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { @@ -1102,10 +1101,10 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + bits += 1; } if (symbol == 0) break; } @@ -1128,24 +1127,23 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, if (ref_list[ref_list_idx] > 1) { // parseRefFrmIdx int32_t ref_frame = ref_idx; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { int32_t i; int32_t ref_num = ref_list[ref_list_idx] - 2; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); + ref_frame--; for (i = 0; i < ref_num; ++i) { const uint32_t symbol = (i == ref_frame) ? 0 : 1; if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + bits += 1; } if (symbol == 0) break; } @@ -1155,7 +1153,7 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, NULL); + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits); } // Signal which candidate MV to use @@ -1165,12 +1163,12 @@ double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, cur_mv_cand, 1, AMVP_MAX_NUM_CANDS - 1, - NULL); + &bits); } } } - *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); + *bitcost = bits; // Store bitcost before restoring cabac return *bitcost * state->lambda_sqrt; diff --git a/src/search.c b/src/search.c index 2cf9dae6..ff116140 100644 --- a/src/search.c +++ b/src/search.c @@ -676,7 +676,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // Assign correct depth limit constraint_t* constr = state->constraint; - if(constr->ml_intra_depth_ctu) { + if(constr->ml_intra_depth_ctu) { pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; } diff --git a/src/search_inter.c b/src/search_inter.c index 
a4c75d9e..93ef2333 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "image.h" #include "imagelist.h" #include "inter.h" @@ -343,7 +344,7 @@ static int select_mv_cand(const encoder_state_t *state, int16_t mv_cand[2][2], int32_t mv_x, int32_t mv_y, - uint32_t *cost_out) + double*cost_out) { const bool same_cand = (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); @@ -362,12 +363,12 @@ static int select_mv_cand(const encoder_state_t *state, mvd_coding_cost = get_mvd_coding_cost; } - uint32_t cand1_cost = mvd_coding_cost( + double cand1_cost = mvd_coding_cost( state, &state->cabac, mv_x - mv_cand[0][0], mv_y - mv_cand[0][1]); - uint32_t cand2_cost; + double cand2_cost; if (same_cand) { cand2_cost = cand1_cost; } else { @@ -419,7 +420,7 @@ static double calc_mvd_cost(const encoder_state_t *state, // Check mvd cost only if mv is not merged if (!merged) { - uint32_t mvd_cost = 0; + double mvd_cost = 0; select_mv_cand(state, mv_cand, x, y, &mvd_cost); temp_bitcost += mvd_cost; } @@ -2165,7 +2166,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost = 0; *inter_bitcost = 0; - + for (int i = 0; i < num_pu; ++i) { const int x_pu = PU_GET_X(part_mode, width, x_local, i); const int y_pu = PU_GET_Y(part_mode, width, y_local, i); @@ -2233,14 +2234,13 @@ void kvz_search_cu_smp(encoder_state_t * const state, } } - // Count bits spent for coding the partition mode. - int smp_extra_bits = 1; // horizontal or vertical - if (state->encoder_control->cfg.amp_enable) { - smp_extra_bits += 1; // symmetric or asymmetric - if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) { - smp_extra_bits += 1; // U,L or D,R - } - } + double smp_extra_bits = kvz_encode_part_mode( + state, + &state->search_cabac, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), + depth + ); + // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
smp_extra_bits += 6; From 8d12884e4318c39e99add6fae93fd56d88c1e5de Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 5 Jan 2022 11:14:44 +0200 Subject: [PATCH 061/135] disable VERBOSE --- src/global.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/global.h b/src/global.h index 9a2ee989..2ad0830b 100644 --- a/src/global.h +++ b/src/global.h @@ -110,7 +110,7 @@ typedef int16_t coeff_t; -#define VERBOSE 1 +// #define VERBOSE 1 /* CONFIG VARIABLES */ From 159793f5b4f028b23df2fde7a61dab129df4c5d2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 6 Jan 2022 09:12:03 +0200 Subject: [PATCH 062/135] more accurate get_mvd_coding_cost --- src/search_inter.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 93ef2333..fcd64ba2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -247,10 +247,10 @@ static bool check_mv_cost(inter_search_info_t *info, static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) { - // Calculate 2 * log2(symbol + 2) + // Calculate 2 * log2(symbol ) unsigned bins = 0; - symbol += 2; + symbol += 0; if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } @@ -324,19 +324,21 @@ static void select_starting_point(inter_search_info_t *info, } -static double get_mvd_coding_cost(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +static double get_mvd_coding_cost(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - double bitcost = 0; + double bitcost = 4 << CTX_FRAC_BITS; const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; + bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += abs_mvd.y == 1 ? 
1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; // Round and shift back to integer bits. - return bitcost / (1 << CTX_FRAC_BITS); + return bitcost / (1 << CTX_FRAC_BITS); } From aaac260438c336e40f3e52f212fdf82feb2600d9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 27 Jan 2022 13:35:47 +0200 Subject: [PATCH 063/135] better merge cost --- src/search_inter.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index fcd64ba2..f56998b7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1641,6 +1641,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->cost[i] = MAX_DOUBLE; } + const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { @@ -1678,8 +1679,9 @@ static void search_pu_inter(encoder_state_t * const state, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); // Add cost of coding the merge index - merge->cost[merge->size] += merge_idx * info->state->lambda_sqrt; - merge->bits[merge->size] = merge_idx; + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + merge->cost[merge->size] += bits * info->state->lambda_sqrt; + merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; merge->unit[merge->size] = *cur_pu; @@ -2013,7 +2015,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, lcu, false); - double bits; + double bits = 0; *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); if (reconstruct_chroma) { *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); From 
f3f0037123bc4eb85e99fef0314118c0dfe3d672 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 27 Jan 2022 13:41:19 +0200 Subject: [PATCH 064/135] include root_cbf cost --- src/search_inter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index f56998b7..9bda59b4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2016,9 +2016,14 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, false); double bits = 0; - *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); - if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + *inter_bitcost += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, !!cbf); + + if(cbf) { + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + if (reconstruct_chroma) { + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + } } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); From 5afd3570f6194a0c4c733793b319e6ac2d6e8071 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 08:14:57 +0200 Subject: [PATCH 065/135] Update cu_qt_root_cbf_model --- src/search.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search.c b/src/search.c index ff116140..246583d1 100644 --- a/src/search.c +++ b/src/search.c @@ -454,6 +454,9 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + if(pred_cu->type == CU_INTER && !pred_cu->skipped && depth == pred_cu->depth) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_qt_root_cbf_model, cbf_is_set_any(pred_cu->cbf, depth), tr_tree_bits, "root_cbf"); + } // Add transform_tree split_transform_flag bit cost. 
bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; From 1a9e54601fc68dd461cb1e508fc78651ab9f2622 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 09:08:25 +0200 Subject: [PATCH 066/135] Calculate rd2 cost for all inter modes instead of just the final one --- src/search_inter.c | 36 ++++++++++++++++++++++-------------- src/search_inter.h | 5 +++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 9bda59b4..dadd3df7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1673,13 +1673,17 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - - merge->cost[merge->size] = kvz_satd_any_size(width, height, - lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, - lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); - - // Add cost of coding the merge index + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + if(state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &merge->cost[merge->size], &bits); + } + else { + merge->cost[merge->size] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + } + // Add cost of coding the merge index merge->cost[merge->size] += bits * info->state->lambda_sqrt; merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; @@ -1769,6 +1773,10 @@ static void search_pu_inter(encoder_state_t * const state, amvp[0].size > 0 ? amvp[0].keys[0] : 0, amvp[1].size > 0 ? 
amvp[1].keys[0] : 0 }; + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], @@ -1850,6 +1858,10 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &frac_cost, &frac_bits); + } + amvp[list].cost[key] = frac_cost; amvp[list].bits[key] = frac_bits; } @@ -1919,6 +1931,7 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); @@ -1971,6 +1984,9 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + } } } @@ -2115,14 +2131,6 @@ void kvz_search_cu_inter(encoder_state_t * const state, cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); - } if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index bb9067c5..da547d90 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -92,5 
+92,10 @@ unsigned kvz_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); +void kvz_cu_cost_inter_rd2(encoder_state_t* const state, + int x, int y, int depth, + lcu_t* lcu, + double* inter_cost, + double* inter_bitcost); #endif // SEARCH_INTER_H_ From 6d73db5a2a44e1caf3bcc217dea36a631e8756af Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 12:26:12 +0200 Subject: [PATCH 067/135] Probably correct RD cost calculation for all inter modes --- src/search.c | 33 ++++++------ src/search.h | 9 ++++ src/search_inter.c | 123 +++++++++++++++++++++++++++++++++++++-------- src/search_inter.h | 3 ++ 4 files changed, 130 insertions(+), 38 deletions(-) diff --git a/src/search.c b/src/search.c index 246583d1..185e89fa 100644 --- a/src/search.c +++ b/src/search.c @@ -60,14 +60,6 @@ // Cost threshold for doing intra search in inter frames with --rd=0. static const int INTRA_THRESHOLD = 8; -// Modify weight of luma SSD. -#ifndef LUMA_MULT -# define LUMA_MULT 0.8 -#endif -// Modify weight of chroma SSD. 
-#ifndef CHROMA_MULT -# define CHROMA_MULT 1.5 -#endif static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) { @@ -216,16 +208,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; - ssd += LUMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], LCU_WIDTH, LCU_WIDTH, cu_width ); if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); @@ -253,6 +245,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double *bit_cost) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); // cur_cu is used for TU parameters. 
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -280,7 +273,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag - && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth) + && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth + && !skip_residual_coding) { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); @@ -300,7 +294,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, } - if (cabac->update && tr_cu->tr_depth == tr_cu->depth) { + if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) { // Because these need to be coded before the luma cbf they also need to be counted // before the cabac state changes. However, since this branch is only executed when // calculating the last RD cost it is not problem to include the chroma cbf costs in @@ -340,7 +334,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, width); } - { + + if (!skip_residual_coding) { int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; @@ -349,7 +344,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LUMA_MULT + bits * state->lambda; + return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda; } @@ -362,6 +357,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); double tr_tree_bits = 0; double coeff_bits = 0; @@ -376,7 +372,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } // See luma for why the second condition - if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth)) { + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) { const int tr_depth = depth - pred_cu->depth; cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); @@ -417,6 +413,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, ssd = ssd_u + ssd_v; } + if (!skip_residual_coding) { int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); @@ -427,7 +424,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * CHROMA_MULT + bits * state->lambda; + return (double)ssd * KVZ_CHROMA_MULT + bits * state->lambda; } static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, @@ -553,7 +550,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, } *bit_cost += coeff_bits; double bits = tr_tree_bits + coeff_bits; - return luma_ssd * LUMA_MULT + chroma_ssd * CHROMA_MULT + bits * state->lambda; + return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda; } diff --git a/src/search.h b/src/search.h index b11a0ad5..bcd517cb 100644 --- a/src/search.h +++ b/src/search.h @@ -46,6 +46,15 @@ #define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) + // Modify weight of luma SSD. 
+#ifndef KVZ_LUMA_MULT +# define KVZ_LUMA_MULT 0.8 +#endif +// Modify weight of chroma SSD. +#ifndef KVZ_CHROMA_MULT +# define KVZ_CHROMA_MULT 1.5 +#endif + /** * \brief Data collected during search processes. * diff --git a/src/search_inter.c b/src/search_inter.c index dadd3df7..d1a031ac 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1160,6 +1160,30 @@ static void search_frac(inter_search_info_t *info, *best_bits = bitcost; } +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a) { + assert(!(lcu && cu_a)); + int context = 0; + if(lcu) { + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + if (x) { + context += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->skipped; + } + if (y) { + context += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->skipped; + } + } + else { + if (x > 0) { + context += kvz_cu_array_at_const(cu_a, x - 1, y)->skipped; + } + if (y > 0) { + context += kvz_cu_array_at_const(cu_a, x, y - 1)->skipped; + } + } + return context; +} + /** * \brief Calculate the scaled MV */ @@ -1676,7 +1700,7 @@ static void search_pu_inter(encoder_state_t * const state, double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &merge->cost[merge->size], &bits); + kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1773,10 +1797,6 @@ static void search_pu_inter(encoder_state_t * const state, amvp[0].size > 0 ? amvp[0].keys[0] : 0, amvp[1].size > 0 ? 
amvp[1].keys[0] : 0 }; - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); - } cu_info_t *best_unipred[2] = { &amvp[0].unit[best_keys[0]], @@ -1808,6 +1828,11 @@ static void search_pu_inter(encoder_state_t * const state, } } + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } + // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { @@ -1859,7 +1884,7 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &frac_cost, &frac_bits); + kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); } amvp[list].cost[key] = frac_cost; @@ -1985,7 +2010,7 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } @@ -2012,39 +2037,96 @@ static void search_pu_inter(encoder_state_t * const state, */ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, int x, int y, int depth, + cu_info_t* cur_cu, lcu_t *lcu, double *inter_cost, double* inter_bitcost){ - - cu_info_t 
*cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int tr_depth = MAX(1, depth); if (cur_cu->part_size != SIZE_2Nx2N) { tr_depth = depth + 1; } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + const int x_px = SUB_SCU(x); + const int y_px = SUB_SCU(y); + const int width = LCU_WIDTH >> depth; + const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); - kvz_quantize_lcu_residual(state, true, reconstruct_chroma, - x, y, depth, - NULL, - lcu, - false); + int index = y_px * LCU_WIDTH + x_px; + double ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width) * KVZ_LUMA_MULT; + if (reconstruct_chroma) { + int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; + double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + ssd += ssd_u + ssd_v; + ssd *= KVZ_CHROMA_MULT; + } + double no_cbf_bits; double bits = 0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); - *inter_bitcost += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, !!cbf); + int skip_context = kvz_get_skip_context(x, y, lcu, NULL); + if (cur_cu->merged) { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); + } + else { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1); + } + double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; + kvz_quantize_lcu_residual(state, true, reconstruct_chroma, + x, y, depth, + NULL, + lcu, + false); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + double temp_bits = 0; if(cbf) { - *inter_cost = 
kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu, &bits); + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); } } + else { + // If we have no coeffs after quant we already have the cost calculated + *inter_cost = no_cbf_cost; + if(cur_cu->merged) { + *inter_bitcost += no_cbf_bits; + } + return; + } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += *inter_bitcost * state->lambda; + *inter_cost += (*inter_bitcost +bits )* state->lambda; + + if(no_cbf_cost < *inter_cost && 0) { + cur_cu->cbf = 0; + if (cur_cu->merged) { + cur_cu->skipped = 1; + } + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + *inter_cost = no_cbf_cost; + if (cur_cu->merged) { + *inter_bitcost += no_cbf_bits; + } + } + else if(cur_cu->merged) { + if (cur_cu->merged) { + *inter_bitcost += bits; + } + } } @@ -2267,7 +2349,8 @@ void kvz_search_cu_smp(encoder_state_t * const state, // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, + x, y, depth, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), lcu, inter_cost, inter_bitcost); diff --git a/src/search_inter.h b/src/search_inter.h index da547d90..41988033 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -94,8 +94,11 @@ unsigned kvz_inter_satd_cost(const encoder_state_t* state, int y); void kvz_cu_cost_inter_rd2(encoder_state_t* const state, int x, int y, int depth, + cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, double* inter_bitcost); +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a); + #endif // SEARCH_INTER_H_ From a0e7165df4048c466403e784df52231707ff4081 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: 
Mon, 31 Jan 2022 08:33:31 +0200 Subject: [PATCH 068/135] use correct pu for rd calc --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index d1a031ac..42a577ba 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2085,7 +2085,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, kvz_quantize_lcu_residual(state, true, reconstruct_chroma, x, y, depth, - NULL, + cur_cu, lcu, false); From a9255901d9e2b2a8b52d7e2dc6e481d33e4b782f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 31 Jan 2022 09:31:44 +0200 Subject: [PATCH 069/135] Only perform rd2 calculation on the best candidate of the list if it exists But only for 2Nx2N blocks --- src/search_inter.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 42a577ba..92d96303 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1699,7 +1699,7 @@ static void search_pu_inter(encoder_state_t * const state, kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2) { + if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); } else { @@ -1828,9 +1828,9 @@ static void search_pu_inter(encoder_state_t * const state, } } - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + if (amvp[0].size) 
kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); } // Fractional-pixel motion estimation. @@ -1883,7 +1883,7 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2) { + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); } @@ -2009,7 +2009,7 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); - if (state->encoder_control->cfg.rdo >= 2) { + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } @@ -2051,6 +2051,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, const int x_px = SUB_SCU(x); const int y_px = SUB_SCU(y); const int width = LCU_WIDTH >> depth; + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); @@ -2063,12 +2065,11 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + width / 2); double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); - ssd += ssd_u + ssd_v; - ssd *= KVZ_CHROMA_MULT; 
+ width / 2); + ssd += (ssd_u + ssd_v) * KVZ_CHROMA_MULT; } double no_cbf_bits; double bits = 0; From ff02a84a96bc785449f3ddd58fb66b3989f20518 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 13:20:51 +0200 Subject: [PATCH 070/135] Probably better order of things --- src/search_inter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 92d96303..b504ed57 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1697,10 +1697,18 @@ static void search_pu_inter(encoder_state_t * const state, } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { + + } } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1712,11 +1720,6 @@ static void search_pu_inter(encoder_state_t * const state, merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; - merge->unit[merge->size] = *cur_pu; - merge->unit[merge->size].type = CU_INTER; - merge->unit[merge->size].merge_idx = merge_idx; - merge->unit[merge->size].merged = true; - merge->unit[merge->size].skipped = false; merge->size++; } From 71b1e59548d896868e184593405901440a3d6258 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 28 Jan 2022 13:24:57 +0200 Subject: [PATCH 071/135] Better early-skip? 
--- src/search_inter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index b504ed57..0bbca858 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1707,7 +1707,12 @@ static void search_pu_inter(encoder_state_t * const state, if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { - + *cur_pu = merge->unit[merge->size]; + merge->unit[0] = *cur_pu; + merge->size = 1; + merge->cost[0] = merge->cost[merge->size]; + merge->bits[0] = bits; + return; } } else { @@ -1732,7 +1737,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N && cfg->rdo < 2) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { // Reconstruct blocks with merge candidate. 
@@ -2115,7 +2120,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_cost += (*inter_bitcost +bits )* state->lambda; - if(no_cbf_cost < *inter_cost && 0) { + if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; if (cur_cu->merged) { cur_cu->skipped = 1; From c7174b25cf8204f6761833485b01f9defa0d1c08 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 1 Feb 2022 14:16:38 +0200 Subject: [PATCH 072/135] smp/amp CUs cannot be skipped --- src/search_inter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 0bbca858..dfd5563c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2082,7 +2082,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, double no_cbf_bits; double bits = 0; int skip_context = kvz_get_skip_context(x, y, lcu, NULL); - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); } @@ -2110,7 +2110,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, else { // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; - if(cur_cu->merged) { + if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { *inter_bitcost += no_cbf_bits; } return; @@ -2122,12 +2122,12 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->skipped = 1; } kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; - if (cur_cu->merged) { + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { *inter_bitcost += no_cbf_bits; } } From 3ac17ffd9525967e5c4e8bc7cbb3f703993b7204 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 
09:51:25 +0200 Subject: [PATCH 073/135] better early skip? --- src/search_inter.c | 80 +++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index dfd5563c..cb7c9683 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1706,14 +1706,6 @@ static void search_pu_inter(encoder_state_t * const state, double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); - if(state->encoder_control->cfg.early_skip && merge->unit[merge->size].skipped) { - *cur_pu = merge->unit[merge->size]; - merge->unit[0] = *cur_pu; - merge->size = 1; - merge->cost[0] = merge->cost[merge->size]; - merge->bits[0] = bits; - return; - } } else { merge->cost[merge->size] = kvz_satd_any_size(width, height, @@ -1737,41 +1729,49 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N && cfg->rdo < 2) { + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { - - // Reconstruct blocks with merge candidate. - // Check luma CBF. Then, check chroma CBFs if luma CBF is not set - // and chroma exists. - // Early terminate if merge candidate with zero CBF is found. 
- int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; - cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; - cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; - cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; - cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; - cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; - cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); - kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { - continue; + if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { + merge->size = 1; + merge->bits[0] = merge->bits[merge->keys[merge_key]]; + merge->cost[0] = merge->cost[merge->keys[merge_key]]; + merge->unit[0] = merge->unit[merge->keys[merge_key]]; + merge->keys[0] = 0; } - else if (has_chroma) { - kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { - cur_pu->type = CU_INTER; - cur_pu->merge_idx = merge_idx; - cur_pu->skipped = true; + else if(cfg->rdo < 2) { + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. 
+ int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; + kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + kvz_inter_recon_cu(state, lcu, x, y, width, true, false); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); - merge->size = 1; - merge->cost[0] = 0.0; // TODO: Check this - merge->bits[0] = merge_idx; // TODO: Check this - merge->unit[0] = *cur_pu; - return; + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + continue; + } + else if (has_chroma) { + kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = true; + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; + return; + } } } } From 8cd81e3dcf5c6039816d1781ff3dcee8d3daf077 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 10:11:40 +0200 Subject: [PATCH 074/135] Only count smp extra cbf bits when rd < 2 --- src/search_inter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index cb7c9683..abeff412 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2351,7 +2351,9 @@ void kvz_search_cu_smp(encoder_state_t * const state, // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
- smp_extra_bits += 6; + if(state->encoder_control->cfg.rdo < 2) { + smp_extra_bits += 6; + } *inter_bitcost += smp_extra_bits; From 49c8334dd7b88279892a41a1427982463155a3b0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 2 Feb 2022 13:31:59 +0200 Subject: [PATCH 075/135] count skip flag --- src/search.c | 1 + src/search_inter.c | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index 185e89fa..b4581fe7 100644 --- a/src/search.c +++ b/src/search.c @@ -785,6 +785,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if(state->frame->slicetype != KVZ_SLICE_I) { double pred_mode_type_bits = 0; CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); intra_cost += pred_mode_type_bits * state->lambda; } if (intra_cost < cost) { diff --git a/src/search_inter.c b/src/search_inter.c index abeff412..e16ac483 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2021,7 +2021,15 @@ static void search_pu_inter(encoder_state_t * const state, kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } - + const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += no_skip_flag; + amvp[i].cost[best_key] += no_skip_flag * state->lambda; + } + } } /** @@ -2081,14 +2089,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, } double no_cbf_bits; double bits = 0; - int skip_context = 
kvz_get_skip_context(x, y, lcu, NULL); + const int skip_context = kvz_get_skip_context(x, y, lcu, NULL); + double no_skip_flag_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; } else { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1); + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0) + no_skip_flag_bits; + bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; } double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; @@ -2118,7 +2127,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += (*inter_bitcost +bits )* state->lambda; + *inter_cost += (*inter_bitcost + bits)* state->lambda; if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; @@ -2131,10 +2140,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_bitcost += no_cbf_bits; } } - else if(cur_cu->merged) { - if (cur_cu->merged) { - *inter_bitcost += bits; - } + else if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + *inter_bitcost += no_skip_flag_bits; } } @@ -2349,6 +2356,8 @@ void kvz_search_cu_smp(encoder_state_t * const state, depth ); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); + // The transform is split for SMP and AMP blocks so we need more bits for // coding the CBF. 
if(state->encoder_control->cfg.rdo < 2) { From 2ac9daf6e4ce5a937fe117e25d5870441a39d7d1 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 3 Feb 2022 10:02:48 +0200 Subject: [PATCH 076/135] accurate inter bit cost during search --- src/search_inter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index e16ac483..55d6c3f2 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2067,6 +2067,10 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, const int x_px = SUB_SCU(x); const int y_px = SUB_SCU(y); const int width = LCU_WIDTH >> depth; + cabac_data_t cabac_copy; + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); + cabac_copy.update = 1; + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); *cur_pu = *cur_cu; @@ -2090,16 +2094,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, double no_cbf_bits; double bits = 0; const int skip_context = kvz_get_skip_context(x, y, lcu, NULL); - double no_skip_flag_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 0); if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1); - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; + bits += kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); } else { - no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 0) + no_skip_flag_bits; - bits += CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_qt_root_cbf_model, 1) + no_skip_flag_bits; + no_cbf_bits = kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1); } - 
double no_cbf_cost = ssd + (no_cbf_bits + *inter_bitcost) * state->lambda; + double no_cbf_cost = ssd + no_cbf_bits * state->lambda; kvz_quantize_lcu_residual(state, true, reconstruct_chroma, x, y, depth, @@ -2120,14 +2123,15 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_cbf_bits; + *inter_bitcost = no_cbf_bits; } return; } FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - *inter_cost += (*inter_bitcost + bits)* state->lambda; + *inter_cost += (bits)* state->lambda; + *inter_bitcost = bits; if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; @@ -2136,12 +2140,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, } kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_cbf_bits; - } - } - else if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost += no_skip_flag_bits; + *inter_bitcost = no_cbf_bits; + } } From d720305feacfdaf650d6767e7d8f838eaae54902 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 3 Feb 2022 11:45:12 +0200 Subject: [PATCH 077/135] Don't double count some of the bits --- src/search.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/search.c b/src/search.c index b4581fe7..a320f5be 100644 --- a/src/search.c +++ b/src/search.c @@ -451,9 +451,6 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - if(pred_cu->type == CU_INTER && !pred_cu->skipped && depth == pred_cu->depth) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_qt_root_cbf_model, cbf_is_set_any(pred_cu->cbf, depth), tr_tree_bits, "root_cbf"); - } // Add transform_tree 
split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; @@ -753,9 +750,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); } } - double pred_mode_type_bits = 0; - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 0, pred_mode_type_bits, "pred_mode_flag"); - cost += pred_mode_type_bits * state->lambda; } } From d1ba62aea9ab2025c11700e2b7c9a922ab3aabc0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 4 Feb 2022 10:25:16 +0200 Subject: [PATCH 078/135] Better inter bit_costs for rdo < 2 --- src/encode_coding_tree.c | 10 ------ src/search.c | 16 ++++++++-- src/search_inter.c | 69 +++++++++++++++++++++------------------- 3 files changed, 50 insertions(+), 45 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ffd8ae1e..d7b80fb7 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1058,16 +1058,6 @@ double kvz_mock_encode_coding_unit( kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); } - - { - int cbf = cbf_is_set_any(cur_cu->cbf, depth); - // Only need to signal coded block flag if not skipped or merged - // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, bits, "rqt_root_cbf"); - } - - } } else if (cur_cu->type == CU_INTRA) { encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); diff --git a/src/search.c b/src/search.c index a320f5be..ef0587eb 100644 --- a/src/search.c +++ b/src/search.c @@ -452,6 +452,15 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + { + int cbf = cbf_is_set_any(pred_cu->cbf, depth); + // Only need 
to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + } + + } // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; int max_tr_depth; @@ -851,9 +860,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; - } + int skip_ctx = kvz_get_skip_context(x, y, lcu, NULL); + inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1); + inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0); + inter_bitcost += cur_cu->merge_idx; } } lcu_fill_inter(lcu, x_local, y_local, cu_width); diff --git a/src/search_inter.c b/src/search_inter.c index 55d6c3f2..1c8e2fd0 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1666,6 +1666,7 @@ static void search_pu_inter(encoder_state_t * const state, } const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0); // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { @@ -1711,6 +1712,7 @@ static void search_pu_inter(encoder_state_t * const state, merge->cost[merge->size] = kvz_satd_any_size(width, height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + bits += no_skip_flag; } // Add cost of coding the merge index 
merge->cost[merge->size] += bits * info->state->lambda_sqrt; @@ -1836,11 +1838,6 @@ static void search_pu_inter(encoder_state_t * const state, } } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); - } - // Fractional-pixel motion estimation. // Refine the best PUs so far from both lists, if available. for (int list = 0; list < 2; ++list) { @@ -1914,6 +1911,11 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { + if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + } + // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B && cfg->bipred @@ -2021,13 +2023,16 @@ static void search_pu_inter(encoder_state_t * const state, kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } - const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); - for(int i = 0; i < 3; i++) { - if(amvp[i].size > 0) { - const uint8_t best_key = amvp[i].keys[0]; - amvp[i].bits[best_key] += no_skip_flag; - amvp[i].cost[best_key] += no_skip_flag * state->lambda; + if(cfg->rdo < 2) { + const int skip_contest = kvz_get_skip_context(x, y, 
lcu, NULL); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + const double part_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1); + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; + amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda; + } } } } @@ -2256,7 +2261,7 @@ void kvz_search_cu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void kvz_search_cu_smp(encoder_state_t * const state, +void kvz_search_cu_smp(encoder_state_t* const state, int x, int y, int depth, part_mode_t part_mode, @@ -2281,19 +2286,19 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost = 0; *inter_bitcost = 0; - + for (int i = 0; i < num_pu; ++i) { const int x_pu = PU_GET_X(part_mode, width, x_local, i); const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); - double cost = MAX_DOUBLE; + double cost = MAX_DOUBLE; double bitcost = MAX_INT; search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); - cu_info_t *best_inter_pu = NULL; + cu_info_t* best_inter_pu = NULL; // Find best AMVP PU for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { @@ -2301,7 +2306,7 @@ void kvz_search_cu_smp(encoder_state_t * const state, int best_key = amvp[mv_dir - 1].keys[0]; if (amvp[mv_dir - 1].size > 0 && - amvp[mv_dir - 1].cost[best_key] < cost) { + amvp[mv_dir - 1].cost[best_key] < cost) { best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; cost = amvp[mv_dir - 1].cost[best_key]; @@ -2329,12 +2334,12 @@ void kvz_search_cu_smp(encoder_state_t * const state, *inter_cost += cost; *inter_bitcost += bitcost; - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); + cu_info_t* cur_pu = 
LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); *cur_pu = *best_inter_pu; for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { - cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t* scu = LCU_GET_CU_AT_PX(lcu, x, y); scu->type = CU_INTER; scu->inter = cur_pu->inter; } @@ -2348,23 +2353,23 @@ void kvz_search_cu_smp(encoder_state_t * const state, assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } } + double smp_extra_bits = 0; + if (state->encoder_control->cfg.rdo < 2) { + smp_extra_bits = kvz_encode_part_mode( + state, + &state->search_cabac, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), + depth + ); - double smp_extra_bits = kvz_encode_part_mode( - state, - &state->search_cabac, - LCU_GET_CU_AT_PX(lcu, x_local, y_local), - depth - ); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); - CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, smp_extra_bits, "skip_flag"); - - // The transform is split for SMP and AMP blocks so we need more bits for - // coding the CBF. - if(state->encoder_control->cfg.rdo < 2) { + // The transform is split for SMP and AMP blocks so we need more bits for + // coding the CBF. 
smp_extra_bits += 6; - } - *inter_bitcost += smp_extra_bits; + *inter_bitcost += smp_extra_bits; + } // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { From e0ed91658b4a7d3122b70d2fc3fc129dbc904605 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 8 Feb 2022 08:11:23 +0200 Subject: [PATCH 079/135] Fix no-early-skip without breaking early-skip --- src/search_inter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 1c8e2fd0..430a40c9 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1713,9 +1713,9 @@ static void search_pu_inter(encoder_state_t * const state, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; + merge->cost[merge->size] += bits * info->state->lambda_sqrt; } // Add cost of coding the merge index - merge->cost[merge->size] += bits * info->state->lambda_sqrt; merge->bits[merge->size] = bits; merge->keys[merge->size] = merge->size; @@ -2127,9 +2127,8 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, else { // If we have no coeffs after quant we already have the cost calculated *inter_cost = no_cbf_cost; - if(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - *inter_bitcost = no_cbf_bits; - } + cur_cu->cbf = 0; + *inter_bitcost = no_cbf_bits; return; } @@ -2143,7 +2142,6 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->skipped = 1; } - kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); *inter_cost = no_cbf_cost; *inter_bitcost = no_cbf_bits; @@ -2233,7 +2231,9 @@ void kvz_search_cu_inter(encoder_state_t * const state, const int y_local = SUB_SCU(y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - + + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), + true, 
state->encoder_control->chroma_format != KVZ_CSP_400); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); From 00516d3dceaffedc2977d6c6086d2574a87281a2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 9 Feb 2022 09:51:21 +0200 Subject: [PATCH 080/135] Make sure intra does not accidentally skip coeff cost calculation --- src/search_intra.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/search_intra.c b/src/search_intra.c index 2986f67f..07dfb798 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -605,6 +605,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.depth = depth; pred_cu.type = CU_INTRA; pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); + pred_cu.skipped = 0; + pred_cu.merged = 0; pred_cu.intra.mode = modes[rdo_mode]; pred_cu.intra.mode_chroma = modes[rdo_mode]; FILL(pred_cu.cbf, 0); From b0037b814d20106c33ce5cc4885ec9b400604962 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 12:15:03 +0200 Subject: [PATCH 081/135] Use correct lambda # Conflicts: # src/search_inter.c --- src/search_inter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_inter.c b/src/search_inter.c index 430a40c9..c2c69c00 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2031,7 +2031,7 @@ static void search_pu_inter(encoder_state_t * const state, if(amvp[i].size > 0) { const uint8_t best_key = amvp[i].keys[0]; amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; - amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda; + amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda_sqrt; } } } From 1ae5ecdec5eb024c7e040399db06992de011d6dc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 12:17:59 +0200 Subject: [PATCH 082/135] include pred_mode_bits for 2Nx2N inter pus for rd=0/1 # Conflicts: # src/search_inter.c --- 
src/search_inter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index c2c69c00..abc44278 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2026,12 +2026,16 @@ static void search_pu_inter(encoder_state_t * const state, if(cfg->rdo < 2) { const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); - const double part_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1); + const double part_mode_bits = state->encoder_control->cfg.smp_enable || state->encoder_control->cfg.amp_enable ? + CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1) + : 0; + const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model, 0); + const double total_bits = no_skip_flag + part_mode_bits + pred_mode_bits; for(int i = 0; i < 3; i++) { if(amvp[i].size > 0) { const uint8_t best_key = amvp[i].keys[0]; - amvp[i].bits[best_key] += no_skip_flag + part_mode_bits; - amvp[i].cost[best_key] += (no_skip_flag + part_mode_bits)* state->lambda_sqrt; + amvp[i].bits[best_key] += total_bits; + amvp[i].cost[best_key] += (total_bits)* state->lambda_sqrt; } } } From 352d6750f583f4c94f009ad6425045dd8ec275a8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 14 Mar 2022 14:33:36 +0200 Subject: [PATCH 083/135] Remove instrumentation code --- src/cabac.c | 3 -- src/cabac.h | 14 ++------ src/encode_coding_tree.c | 7 ---- src/encoderstate.c | 2 -- src/sao.c | 3 -- src/search.c | 74 ++++++++++++++-------------------------- src/search.h | 12 +++---- src/search_inter.c | 12 +++---- src/search_intra.c | 29 ++++++---------- 9 files changed, 48 insertions(+), 108 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index ae31fb0b..7cd7d926 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -37,8 +37,6 @@ #include "extras/crypto.h" #include "kvazaar.h" -FILE* 
bit_cost_file = NULL; - const uint8_t kvz_g_auc_next_state_mps[128] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, @@ -107,7 +105,6 @@ void kvz_cabac_encode_bin(cabac_data_t * const data, const uint32_t bin_value) { uint32_t lps; - if (!(data)->only_count) bits_written += CTX_ENTROPY_FBITS((data)->cur_ctx, (bin_value)); lps = kvz_g_auc_lpst_table[CTX_STATE(data->cur_ctx)][(data->range >> 6) & 3]; data->range -= lps; diff --git a/src/cabac.h b/src/cabac.h index 6c46011b..b15cbb75 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -42,8 +42,6 @@ #include "bitstream.h" -extern FILE* bit_cost_file; - struct encoder_state_t; // Types @@ -139,7 +137,6 @@ extern const float kvz_f_entropy_bits[128]; CABAC_BIN((cabac), (val), (name));\ } \ } while(0) -extern double bits_written; // Macros #define CTX_STATE(ctx) ((ctx)->uc_state >> 1) @@ -147,30 +144,23 @@ extern double bits_written; #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } -#ifdef VERBOSE -#define FILE_BITS(bits, x, y, depth, name) fprintf(bit_cost_file, "%s\t%d\t%d\t%d\t%f\n", (name), (x), (y), (depth), (bits)) -#else -#define FILE_BITS(bits, x, y, depth, name) {} -#endif #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin((data), (value)); \ - if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u bits = %f\n", \ - (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx), bits_written); } + if(!(data)->only_count) printf("%s = %u, state = %u -> %u MPS = %u\n", \ + (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx)); } #define CABAC_BINS_EP(data, value, bins, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bins_ep((data), (value), (bins)); \ - if(!(data)->only_count) bits_written 
+= (bins); \ if(!(data)->only_count) printf("%s = %u(%u bins), state = %u -> %u\n", \ (name), (uint32_t)(value), (bins), prev_state, (data)->cur_ctx->uc_state); } #define CABAC_BIN_EP(data, value, name) { \ uint32_t prev_state = (data)->cur_ctx->uc_state; \ kvz_cabac_encode_bin_ep((data), (value)); \ - if(!(data)->only_count) bits_written += 1; \ if(!(data)->only_count) printf("%s = %u, state = %u -> %u\n", \ (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state); } #else diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index d7b80fb7..afff8a06 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -943,13 +943,6 @@ end: if (is_last_cu_in_qg(state, x, y, depth)) { state->last_qp = cur_cu->qp; } -#ifdef VERBOSE - if((x % 64 != 0 && y % 64 != 0) || 1) { - fprintf(stderr, "%f\t%d\t%d\t%d\n", bits_written, x, y, depth); - bits_written = 0; - } -#endif - } double kvz_mock_encode_coding_unit( diff --git a/src/encoderstate.c b/src/encoderstate.c index d02ca483..f187ca61 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -1661,11 +1661,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s } } -double bits_written; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) { - bits_written = 0; encoder_state_init_new_frame(state, frame); encoder_state_encode(state); diff --git a/src/sao.c b/src/sao.c index b7d76e64..e3154c20 100644 --- a/src/sao.c +++ b/src/sao.c @@ -510,7 +510,6 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (state->encoder_control->cfg.sao_type & 1){ sao_search_edge_sao(state, data, recdata, block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left); float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt); - FILE_BITS(mode_bits, 0, 0, 0, "sao mode bits"); int ddistortion = (int)(mode_bits * state->lambda + 0.5); unsigned buf_i; @@ -557,7 +556,6 @@ static 
void sao_search_best_mode(const encoder_state_t * const state, const kvz_ { float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); - FILE_BITS(mode_bits_none, 0, 0, 0, "Sao cost of nothing"); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; @@ -574,7 +572,6 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ if (merge_cand) { unsigned buf_i; float mode_bits = sao_mode_bits_merge(state, i + 1); - FILE_BITS(mode_bits, 0, 0, 0, (i == 0 ? "sao merge ""left" : "sao merge ""top")); int ddistortion = (int)(mode_bits * state->lambda + 0.5); switch (merge_cand->type) { diff --git a/src/search.c b/src/search.c index ef0587eb..943fd9b9 100644 --- a/src/search.c +++ b/src/search.c @@ -239,10 +239,9 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, * prediction unit data needs to be coded. */ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); @@ -278,17 +277,16 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, { cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search"); - *bit_cost += tr_tree_bits; } if (tr_depth > 0) { int offset = width / 2; double sum = 0; - sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, 
depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } @@ -322,7 +320,6 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); - *bit_cost += tr_tree_bits; } // SSD between reconstruction and original @@ -340,7 +337,6 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode); - *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -349,10 +345,9 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; @@ -385,17 +380,16 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } - *bit_cost += tr_tree_bits; } if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); double sum = 0; - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); - sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } @@ -420,7 +414,6 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order); coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order); - *bit_cost += coeff_bits; } double bits = tr_tree_bits + coeff_bits; @@ -428,10 +421,9 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, - const int x_px, const int y_px, const int depth, - const cu_info_t* const pred_cu, - lcu_t* const lcu, - double* bit_cost) { + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu) { const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && 
pred_cu->cbf == 0); @@ -492,12 +484,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, if (tr_depth > 0) { int offset = LCU_WIDTH >> (depth + 1); double sum = 0; - *bit_cost += tr_tree_bits; - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, bit_cost); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); return sum + tr_tree_bits * state->lambda; } const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; @@ -514,7 +505,6 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); } - *bit_cost += tr_tree_bits; // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless) { @@ -554,7 +544,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order); } } - *bit_cost += coeff_bits; + double bits = tr_tree_bits + coeff_bits; return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda; } @@ -895,13 +885,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, &bits); 
- //if (state->encoder_control->chroma_format != KVZ_CSP_400) { - // cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, & bits); - //} - - FILE_BITS(bits, x, y, depth, "final rd bits"); - + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -958,7 +943,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } - FILE_BITS(split_bits, x, y, depth, "split"); state->search_cabac.update = 0; split_cost += split_bits * state->lambda; @@ -1023,12 +1007,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y) + bits; cost += mode_bits * state->lambda; - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - if (has_chroma) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu, &bits); - } - - FILE_BITS(bits, x, y, depth, "merged intra bits"); + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); @@ -1200,9 +1179,6 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf) { -#ifdef VERBOSE - if (bit_cost_file == NULL) bit_cost_file = fopen("bits_file.txt", "w"); -#endif memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index bcd517cb..51b30ae4 100644 --- 
a/src/search.h +++ b/src/search.h @@ -79,13 +79,13 @@ void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, double *bits); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu, double* bits); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index abc44278..c275a8bc 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2120,12 +2120,11 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, false); int cbf = cbf_is_set_any(cur_cu->cbf, depth); - - double temp_bits = 0; + if(cbf) { - *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); + *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu, &temp_bits); + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); } } else { @@ -2135,9 +2134,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, *inter_bitcost = no_cbf_bits; return; } - - FILE_BITS(bits, x, y, depth, "inter rd 2 bits"); - + *inter_cost += (bits)* state->lambda; *inter_bitcost = bits; @@ -2246,7 +2243,6 @@ void 
kvz_search_cu_inter(encoder_state_t * const state, if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } - FILE_BITS((double)*inter_bitcost, x, y, depth, "regular inter bitcost"); } diff --git a/src/search_intra.c b/src/search_intra.c index 07dfb798..ad469859 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -179,8 +179,7 @@ static double search_intra_trdepth(encoder_state_t * const state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, cu_info_t *const pred_cu, - lcu_t *const lcu, - double *bit_cost) + lcu_t *const lcu) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -202,7 +201,6 @@ static double search_intra_trdepth(encoder_state_t * const state, double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; - double nosplit_bits = 0; if (depth > 0) { tr_cu->tr_depth = depth; @@ -223,9 +221,9 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_mode, chroma_mode, pred_cu, lcu); - nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); + nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); if (reconstruct_chroma) { - nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu, &nosplit_bits); + nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } // Early stop codition for the recursive search. 
@@ -252,15 +250,15 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, bit_cost); + split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } double tr_split_bit = 0.0; @@ -271,7 +269,6 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth >= 1 && depth <= 3) { cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split"); - *bit_cost += tr_split_bit; } // Add cost of cbf chroma bits on transform tree. 
@@ -290,7 +287,6 @@ static double search_intra_trdepth(encoder_state_t * const state, if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } - *bit_cost += cbf_bits; } double bits = tr_split_bit + cbf_bits; @@ -613,9 +609,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - - double bit_costs = 0; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, &bit_costs); + + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); costs[rdo_mode] += mode_cost; // Early termination if no coefficients has to be coded @@ -640,9 +635,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.intra.mode = modes[0]; pred_cu.intra.mode_chroma = modes[0]; FILL(pred_cu.cbf, 0); - double bit_cost = 0; - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, &bit_cost); - FILE_BITS(bit_cost, x_px, y_px, depth, "tr_depth bits"); + search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); } return modes_to_check; @@ -738,7 +731,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, -1, chroma.mode, // skip luma NULL, lcu); double bits = 0; - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &bits); + chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); bits += mode_bits; From e39fbb11a7981bc7b943b321508b941f41e667d8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 16 Mar 2022 09:14:08 +0200 Subject: [PATCH 084/135] Disable bit calculations that always degrade quality --- 
src/search.c | 4 ++++ src/search_inter.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/search.c b/src/search.c index 943fd9b9..d1fc19a1 100644 --- a/src/search.c +++ b/src/search.c @@ -775,12 +775,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, double intra_cost; kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_cost); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default if(state->frame->slicetype != KVZ_SLICE_I) { double pred_mode_type_bits = 0; CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); intra_cost += pred_mode_type_bits * state->lambda; } +#endif if (intra_cost < cost) { cost = intra_cost; cur_cu->type = CU_INTRA; diff --git a/src/search_inter.c b/src/search_inter.c index c275a8bc..d0db3e89 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1666,7 +1666,13 @@ static void search_pu_inter(encoder_state_t * const state, } const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0); +#else + const double no_skip_flag = 0; +#endif // Check motion vector constraints and perform rough search for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { From 9b7dc207b6bbc518345771b352e3dc8fe44bad8c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 16 Mar 2022 
13:50:27 +0200 Subject: [PATCH 085/135] remove unnecessary copying of cabac state --- src/encoderstate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index f187ca61..6cf40292 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -732,8 +732,6 @@ static void encoder_state_worker_encode_lcu(void * opaque) kvz_bitstream_align_zero(state->cabac.stream); kvz_cabac_start(&state->cabac); - memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); - state->search_cabac.only_count = 1; kvz_crypto_delete(&state->crypto_hdl); } @@ -1218,8 +1216,6 @@ static void encoder_state_init_children(encoder_state_t * const state) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP, state->frame->slicetype); - memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); - state->search_cabac.only_count = 1; } //Clear the jobs From a88553b2065f53d7ea52659c17ab8fbd0634e77a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 23 Mar 2022 13:39:38 +0200 Subject: [PATCH 086/135] fix jccr and improve intra parameter passing --- src/intra.c | 54 ++++----- src/intra.h | 18 ++- src/search.c | 282 ++++++++++++++++++++++++++++++--------------- src/search.h | 9 ++ src/search_inter.c | 19 ++- src/search_intra.c | 82 +++++++++---- src/search_intra.h | 17 ++- src/transform.c | 39 ++++--- src/transform.h | 20 ++-- 9 files changed, 344 insertions(+), 196 deletions(-) diff --git a/src/intra.c b/src/intra.c index 4c6e204b..9976464b 100644 --- a/src/intra.c +++ b/src/intra.c @@ -452,7 +452,7 @@ static void get_cclm_parameters( } } -static void linear_transform_cclm(cclm_parameters_t* cclm_params, kvz_pixel * src, kvz_pixel * dst, int stride, int height) { +static void linear_transform_cclm(const cclm_parameters_t* cclm_params, kvz_pixel * src, kvz_pixel * dst, int stride, int height) { int scale = cclm_params->a; int shift = 
cclm_params->shift; int offset = cclm_params->b; @@ -1355,13 +1355,9 @@ static void intra_recon_tb_leaf( int x, int y, int depth, - int8_t intra_mode, - cclm_parameters_t *cclm_params, lcu_t *lcu, color_t color, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp) + const intra_parameters_t* intra_paramas) { const kvz_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; @@ -1383,7 +1379,7 @@ static void intra_recon_tb_leaf( int x_scu = SUB_SCU(x); int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; - uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; + uint8_t multi_ref_index = color == COLOR_Y ? intra_paramas->multi_ref_idx : 0; kvz_intra_references refs; // Extra reference lines for use with MRL. Extra lines needed only for left edge. @@ -1409,7 +1405,8 @@ static void intra_recon_tb_leaf( int stride = state->tile->frame->source->stride; const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); bool use_mip = false; - if (mip_flag) { + int8_t intra_mode = color == COLOR_Y ? 
intra_paramas->luma_mode : intra_paramas->chroma_mode; + if (intra_paramas->mip_flag) { if (color == COLOR_Y) { use_mip = true; } else { @@ -1426,21 +1423,15 @@ static void intra_recon_tb_leaf( if(intra_mode < 68) { if (use_mip) { assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); - kvz_mip_predict(state, &refs, width, height, pred, intra_mode, mip_transp); + kvz_mip_predict(state, &refs, width, height, pred, intra_mode, intra_paramas->mip_transp); } else { kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index); } } else { kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width); - if(cclm_params == NULL) { - cclm_parameters_t temp_params; - kvz_predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params); - } - else { - linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width); - } + + linear_transform_cclm(&intra_paramas->cclm_parameters[color == COLOR_U ? 
0 : 1], pred, pred, width, width); } const int index = lcu_px.x + lcu_px.y * lcu_width; @@ -1487,13 +1478,8 @@ void kvz_intra_recon_cu( int x, int y, int depth, - int8_t mode_luma, - int8_t mode_chroma, + const intra_parameters_t* intra_parameters, cu_info_t *cur_cu, - cclm_parameters_t *cclm_params, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp, lcu_t *lcu) { const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; @@ -1501,9 +1487,9 @@ void kvz_intra_recon_cu( if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - uint8_t multi_ref_index = multi_ref_idx; - bool use_mip = mip_flag; - bool mip_transposed = mip_transp; + bool use_mip = intra_parameters->mip_flag; + const int8_t mode_luma = intra_parameters->luma_mode; + const int8_t mode_chroma= intra_parameters->chroma_mode; if (mode_luma != -1 && mode_chroma != -1) { if (use_mip) { @@ -1527,10 +1513,10 @@ void kvz_intra_recon_cu( const int32_t x2 = x + offset; const int32_t y2 = y + offset; - kvz_intra_recon_cu(state, x, y, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - kvz_intra_recon_cu(state, x2, y, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - kvz_intra_recon_cu(state, x, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); + kvz_intra_recon_cu(state, x, y, depth + 1, intra_parameters, NULL, lcu); + kvz_intra_recon_cu(state, x2, y, depth + 1, intra_parameters, NULL, lcu); + kvz_intra_recon_cu(state, x, y2, depth + 1, intra_parameters, NULL, lcu); + kvz_intra_recon_cu(state, x2, y2, depth + 1, intra_parameters, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { @@ -1552,13 +1538,13 @@ void kvz_intra_recon_cu( // Process a leaf TU. 
if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, intra_parameters); } if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed); - intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, intra_parameters); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, intra_parameters); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false); + kvz_quantize_lcu_residual(state, has_luma, has_chroma, intra_parameters->jccr != -1 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0), x, y, depth, cur_cu, lcu, false); } } diff --git a/src/intra.h b/src/intra.h index 4e3542c3..e35f57e5 100644 --- a/src/intra.h +++ b/src/intra.h @@ -63,6 +63,17 @@ typedef struct int16_t b; } cclm_parameters_t; +typedef struct { + int8_t luma_mode; + int8_t chroma_mode; + cclm_parameters_t cclm_parameters[2]; + uint8_t multi_ref_idx; + bool mip_flag; + bool mip_transp; + int8_t mts_idx; + int8_t jccr; +} intra_parameters_t; + /** * \brief Function for deriving intra luma predictions * \param x x-coordinate of the PU in pixels @@ -128,13 +139,8 @@ void kvz_intra_recon_cu( int x, int y, int depth, - int8_t mode_luma, - int8_t mode_chroma, + const intra_parameters_t * intra_parameters, cu_info_t *cur_cu, - cclm_parameters_t* cclm_params, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp, lcu_t *lcu); diff --git a/src/search.c b/src/search.c index 3bd39e6b..da81b0fc 100644 --- a/src/search.c +++ b/src/search.c @@ -384,9 +384,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); double tr_tree_bits = 0; - 
double joint_cbcr_tr_tree_bits = 0; double coeff_bits = 0; - double joint_coeff_bits = 0; assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); @@ -407,19 +405,12 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } - if(state->encoder_control->cfg.jccr) { - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1); - } int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } - if(state->encoder_control->cfg.jccr) { - ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1); - } } @@ -442,15 +433,10 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0); } - if(pred_cu->joint_cb_cr) { - ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]); - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1); - } } // Chroma SSD int ssd = 0; - int joint_ssd = 0; if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], @@ -460,16 +446,6 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, LCU_WIDTH_C, LCU_WIDTH_C, width); ssd = ssd_u + ssd_v; - - if(state->encoder_control->cfg.jccr) { - int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - width); - int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], 
&lcu->rec.joint_v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - width); - joint_ssd = ssd_u_joint + ssd_v_joint; - } } if (!skip_residual_coding) @@ -479,35 +455,12 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0); coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0); - - if(state->encoder_control->cfg.jccr) { - joint_coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); - } } double bits = tr_tree_bits + coeff_bits; - double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits; - double cost = (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda; - double joint_cost = (double)joint_ssd * KVZ_CHROMA_MULT + joint_bits * state->c_lambda; - if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) { - pred_cu->joint_cb_cr = 0; - return cost; - } - cbf_clear(&pred_cu->cbf, depth, COLOR_U); - cbf_clear(&pred_cu->cbf, depth, COLOR_V); - if (pred_cu->joint_cb_cr & 1) { - cbf_set(&pred_cu->cbf, depth, COLOR_U); - } - if (pred_cu->joint_cb_cr & 2) { - cbf_set(&pred_cu->cbf, depth, COLOR_V); - } - int lcu_width = LCU_WIDTH_C; - const int index = lcu_px.x + lcu_px.y * lcu_width; - kvz_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width); - kvz_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width); - return joint_cost; + return (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda; } static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, @@ -577,6 +530,16 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); } + + if (cb_flag_y | cb_flag_u | cb_flag_v) { + // TODO qp_delta_sign_flag + + if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && 
state->encoder_control->cfg.jccr) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag"); + } + } + + // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless) { @@ -597,23 +560,34 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int chroma_width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - if (!state->encoder_control->cfg.lossless) { - int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; - unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + if(pred_cu->joint_cb_cr != 0) { + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + chroma_ssd = ssd_u + ssd_v; + } + + { + + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0); + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0); + } + } else { + int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], LCU_WIDTH_C, LCU_WIDTH_C, chroma_width); - unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); - 
chroma_ssd = ssd_u + ssd_v; - } - - { - int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); - - coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0); - coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0); + chroma_ssd = ssd_u_joint + ssd_v_joint; + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); } } @@ -622,6 +596,118 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, } +void kvz_select_jccr_mode( + const encoder_state_t* const state, + const int x_px, + const int y_px, + const int depth, + cu_info_t* pred_cu, + lcu_t* const lcu, + double* cost_out) +{ + const vector2d_t lcu_px = { (SUB_SCU(x_px) & ~7) / 2, (SUB_SCU(y_px) & ~7) / 2 }; + const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x * 2, lcu_px.y * 2); + assert(pred_cu->depth == pred_cu->tr_depth && "jccr does not support transform splitting"); + if (cost_out == NULL && pred_cu->joint_cb_cr == 0) { + return; + } + + double tr_tree_bits = 0; + double joint_cbcr_tr_tree_bits = 0; + double coeff_bits = 0; + double joint_coeff_bits = 0; + + assert(lcu_px.x >= 0 && lcu_px.x < LCU_WIDTH_C); + assert(lcu_px.y >= 0 && lcu_px.y < LCU_WIDTH_C); + + if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) { + // For MAX_PU_DEPTH calculate chroma for previous depth for the first + // block and return 0 cost for all others. 
+ return; + } + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search"); + + int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1; + if(cbf_mask != -1) + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag"); + + if(pred_cu->joint_cb_cr) { + ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); + CABAC_FBITS_UPDATE(cabac, ctx, (pred_cu->joint_cb_cr & 2) >> 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); + cbf_mask = (pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag"); + } + int ssd = 0; + int joint_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + int ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + ssd = ssd_u + ssd_v; + + if (pred_cu->joint_cb_cr) { + int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + joint_ssd = ssd_u_joint + ssd_v_joint; + } + } + + { + int8_t scan_order = 
kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + if (u_is_set) coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0); + if (v_is_set) coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0); + + joint_coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); + } + + + double bits = tr_tree_bits + coeff_bits; + double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits; + + double cost = (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda; + double joint_cost = (double)joint_ssd * KVZ_CHROMA_MULT + joint_bits * state->c_lambda; + if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) { + pred_cu->joint_cb_cr = 0; + if (cost_out) *cost_out += cost; + return; + } + cbf_clear(&pred_cu->cbf, depth, COLOR_U); + cbf_clear(&pred_cu->cbf, depth, COLOR_V); + if (pred_cu->joint_cb_cr & 1) { + cbf_set(&pred_cu->cbf, depth, COLOR_U); + } + if (pred_cu->joint_cb_cr & 2) { + cbf_set(&pred_cu->cbf, depth, COLOR_V); + } + int lcu_width = LCU_WIDTH_C; + const int index = lcu_px.x + lcu_px.y * lcu_width; + kvz_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width); + kvz_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width); + if (cost_out) *cost_out += joint_cost; +} + + // Return estimate of bits used to code prediction mode of cur_cu. 
static double calc_mode_bits(const encoder_state_t *state, const lcu_t *lcu, @@ -885,15 +971,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I); + intra_parameters_t intra_parameters; if (can_use_intra && !skip_intra) { - int8_t intra_mode; - int8_t intra_trafo; double intra_cost; - uint8_t multi_ref_index = 0; - bool mip_flag = false; - bool mip_transposed = false; + intra_parameters.jccr = -1; kvz_search_cu_intra(state, x, y, depth, lcu, - &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed); + &intra_cost, &intra_parameters); #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default @@ -908,13 +991,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = intra_cost; cur_cu->type = CU_INTRA; cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N; - cur_cu->intra.mode = intra_mode; - cur_cu->intra.multi_ref_idx = multi_ref_index; - cur_cu->intra.mip_flag = mip_flag; - cur_cu->intra.mip_is_transposed = mip_transposed; + cur_cu->intra.mode = intra_parameters.luma_mode; + cur_cu->intra.multi_ref_idx = intra_parameters.multi_ref_idx; + cur_cu->intra.mip_flag = intra_parameters.mip_flag; + cur_cu->intra.mip_is_transposed = intra_parameters.mip_transp; //If the CU is not split from 64x64 block, the MTS is disabled for that CU. - cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0; + cur_cu->tr_idx = (depth > 0) ? 
intra_parameters.mts_idx : 0; } } @@ -925,12 +1008,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->intra.mode_chroma = cur_cu->intra.mode; lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + intra_parameters.chroma_mode = -1; kvz_intra_recon_cu(state, x, y, depth, - cur_cu->intra.mode, -1, // skip chroma - NULL, NULL, cur_cu->intra.multi_ref_idx, - cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + &intra_parameters, + NULL, lcu); downsample_cclm_rec( @@ -943,19 +1026,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. - cclm_parameters_t cclm_params[2]; if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { - cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params); + cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, intra_parameters.cclm_parameters); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - + intra_parameters.chroma_mode = cur_cu->intra.mode_chroma; + intra_parameters.luma_mode = -1; // skip luma + intra_parameters.jccr = 0; kvz_intra_recon_cu(state, x & ~7, y & ~7, // TODO: as does this depth, - -1, cur_cu->intra.mode_chroma, // skip luma - NULL, cclm_params, 0, - cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + &intra_parameters, + NULL, lcu); + if(depth != 0 && state->encoder_control->cfg.jccr) { + kvz_select_jccr_mode(state, + x & ~7, y & ~7, + depth, + NULL, + lcu, + NULL); + } } } else if (cur_cu->type == CU_INTER) { @@ -983,11 +1074,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } kvz_quantize_lcu_residual(state, - true, has_chroma, - x, y, depth, - NULL, - lcu, - false); + true, has_chroma, + state->encoder_control->cfg.jccr, x, y, + depth, + NULL, + 
lcu, + false); int cbf = cbf_is_set_any(cur_cu->cbf, depth); @@ -1142,11 +1234,21 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1; + intra_parameters_t intra_parameters = { + .luma_mode = cur_cu->intra.mode, + .chroma_mode = mode_chroma, + .cclm_parameters ={{0, 0, 0}, {0, 0 ,0}}, + 0, + 0, + 0, + 0, + -1, + }; kvz_intra_recon_cu(state, x, y, depth, - cur_cu->intra.mode, mode_chroma, - NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + &intra_parameters, + NULL, lcu); double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; diff --git a/src/search.h b/src/search.h index db87c298..ba4ca57c 100644 --- a/src/search.h +++ b/src/search.h @@ -91,6 +91,15 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, const int x_px, const int y_px, const int depth, cu_info_t *const pred_cu, lcu_t *const lcu); +void kvz_select_jccr_mode( + const encoder_state_t* const state, + const int x_px, + const int y_px, + const int depth, + cu_info_t* const pred_cu, + lcu_t* const lcu, + double* cost_out); + void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index 73e15f95..c2203b72 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1800,14 +1800,16 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); + kvz_quantize_lcu_residual(state, true, false, false, x, y, depth, 
cur_pu, lcu, true); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + kvz_quantize_lcu_residual(state, false, has_chroma, + false, /*we are only checking for lack of coeffs so no need to check jccr*/ + x, y, depth, cur_pu, lcu, true); if (!cbf_is_set_any(cur_pu->cbf, depth)) { cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; @@ -2159,8 +2161,10 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; - kvz_quantize_lcu_residual(state, true, reconstruct_chroma, - x, y, depth, + kvz_quantize_lcu_residual(state, + true, reconstruct_chroma, + reconstruct_chroma && state->encoder_control->cfg.jccr, x, y, + depth, cur_cu, lcu, false); @@ -2170,7 +2174,12 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if(cbf) { *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + if (cur_cu->depth != cur_cu->tr_depth) { + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + } + else { + kvz_select_jccr_mode(state, x_px, y_px, depth, cur_cu, lcu, inter_cost); + } } } else { diff --git a/src/search_intra.c b/src/search_intra.c index 6f7a9349..bd1433d9 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -315,8 +315,21 @@ static double search_intra_trdepth(encoder_state_t * const state, if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { num_transforms = MAX(num_transforms, 2); } + + intra_parameters_t intra_parameters = { + .luma_mode = intra_mode, + -1, + {{0, 0, 0}, {0, 0 ,0}}, + pred_cu->intra.multi_ref_idx, + pred_cu->intra.mip_flag, + pred_cu->intra.mip_is_transposed, + 0, + -1, + }; + 
for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; + intra_parameters.mts_idx = trafo; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; @@ -332,9 +345,8 @@ static double search_intra_trdepth(encoder_state_t * const state, kvz_intra_recon_cu(state, x_px, y_px, depth, - intra_mode, -1, - pred_cu, cclm_params, pred_cu->intra.multi_ref_idx, - pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed, + &intra_parameters, + pred_cu, lcu); // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1 @@ -359,12 +371,14 @@ static double search_intra_trdepth(encoder_state_t * const state, } } if(reconstruct_chroma) { + intra_parameters.luma_mode = -1; + intra_parameters.chroma_mode = chroma_mode; + intra_parameters.jccr = -1; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently kvz_intra_recon_cu(state, x_px, y_px, depth, - -1, chroma_mode, - pred_cu, cclm_params, 0, - pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed, + &intra_parameters, + pred_cu, lcu); best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } @@ -1020,22 +1034,32 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, double cost; int8_t mode; cclm_parameters_t cclm[2]; + int8_t jccr; } chroma, best_chroma; // chroma.cclm = cclm_params; best_chroma.mode = 0; best_chroma.cost = MAX_INT; + best_chroma.jccr = 0; + + intra_parameters_t intra_parameters; + memset(&intra_parameters, 0, sizeof(intra_parameters_t)); + intra_parameters.luma_mode = -1; // skip luma + + chroma.jccr = 0; for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) { chroma.mode = modes[chroma_mode_i]; if (chroma.mode == -1) continue; + intra_parameters.chroma_mode = modes[chroma_mode_i]; if(chroma.mode < 67 || depth == 0) { kvz_intra_recon_cu(state, x_px, y_px, depth, - -1, chroma.mode, // skip luma - NULL, NULL, 0, false, false, lcu); + &intra_parameters, + NULL, + lcu); } 
else { @@ -1050,6 +1074,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, &cclm_params[0]); chroma.cclm[0] = cclm_params[0]; + intra_parameters.cclm_parameters[0] = cclm_params[0]; kvz_predict_cclm( state, COLOR_V, @@ -1062,16 +1087,23 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, &cclm_params[1]); chroma.cclm[1] = cclm_params[1]; + intra_parameters.cclm_parameters[1] = cclm_params[1]; kvz_intra_recon_cu( state, x_px, y_px, depth, - -1, chroma.mode, // skip luma - NULL, cclm_params, 0, false, false, lcu); + &intra_parameters, + NULL, + lcu); } double bits = 0; - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + if(tr_cu->depth != tr_cu->tr_depth) { + chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + } else { + kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &chroma.cost); + chroma.jccr = tr_cu->joint_cb_cr; + } double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); bits += mode_bits; @@ -1083,6 +1115,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, } best_cclm[0] = best_chroma.cclm[0]; best_cclm[1] = best_chroma.cclm[1]; + tr_cu->joint_cb_cr = best_chroma.jccr; return best_chroma.mode; } @@ -1154,15 +1187,14 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, * Update lcu to have best modes at this depth. * \return Cost of best mode. 
*/ -void kvz_search_cu_intra(encoder_state_t * const state, - const int x_px, const int y_px, - const int depth, lcu_t *lcu, - int8_t *mode_out, - int8_t *trafo_out, - double *cost_out, - uint8_t *multi_ref_idx_out, - bool *mip_flag_out, - bool * mip_transposed_out) +void kvz_search_cu_intra( + encoder_state_t * const state, + const int x_px, + const int y_px, + const int depth, + lcu_t *lcu, + double *cost_out, + intra_parameters_t* intra_parameters) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const int8_t cu_width = LCU_WIDTH >> depth; @@ -1333,10 +1365,10 @@ void kvz_search_cu_intra(encoder_state_t * const state, tmp_best_mode = (tmp_mip_transp ? tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode); } - *mode_out = tmp_best_mode; - *trafo_out = tmp_best_trafo; + intra_parameters->luma_mode = tmp_best_mode; + intra_parameters->mts_idx = tmp_best_trafo; *cost_out = tmp_best_cost; - *mip_flag_out = tmp_mip_flag; - *mip_transposed_out = tmp_mip_transp; - *multi_ref_idx_out = tmp_mip_flag ? 0 : best_line; + intra_parameters->mip_flag = tmp_mip_flag; + intra_parameters->mip_transp = tmp_mip_transp; + intra_parameters->multi_ref_idx = tmp_mip_flag ? 
0 : best_line; } diff --git a/src/search_intra.h b/src/search_intra.h index 659695b3..13a830e0 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -53,14 +53,13 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, const int x_px, const int y_px, const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm); -void kvz_search_cu_intra(encoder_state_t * const state, - const int x_px, const int y_px, - const int depth, lcu_t *lcu, - int8_t *mode_out, - int8_t *trafo_out, - double *cost_out, - uint8_t *multi_ref_idx_out, - bool *mip_flag, - bool *mip_transp); +void kvz_search_cu_intra( + encoder_state_t * const state, + const int x_px, + const int y_px, + const int depth, + lcu_t *lcu, + double *cost_out, + intra_parameters_t* intra_parameters); #endif // SEARCH_INTRA_H_ diff --git a/src/transform.c b/src/transform.c index 4c90f3f4..c5255b07 100644 --- a/src/transform.c +++ b/src/transform.c @@ -479,15 +479,17 @@ static void quantize_tr_residual(encoder_state_t * const state, * - lcu->cbf coded block flags for the area * - lcu->cu.intra.tr_skip tr skip flags for the area (in case of luma) */ -void kvz_quantize_lcu_residual(encoder_state_t * const state, - const bool luma, - const bool chroma, - const int32_t x, - const int32_t y, - const uint8_t depth, - cu_info_t *cur_pu, - lcu_t* lcu, - bool early_skip) +void kvz_quantize_lcu_residual( + encoder_state_t * const state, + const bool luma, + const bool chroma, + const bool jccr, + const int32_t x, + const int32_t y, + const uint8_t depth, + cu_info_t *cur_pu, + lcu_t* lcu, + bool early_skip) { const int32_t width = LCU_WIDTH >> depth; const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; @@ -509,7 +511,7 @@ void kvz_quantize_lcu_residual(encoder_state_t * const state, if (luma) { cbf_clear(&cur_pu->cbf, depth, COLOR_Y); } - if (chroma) { + if (chroma || jccr) { cbf_clear(&cur_pu->cbf, depth, COLOR_U); cbf_clear(&cur_pu->cbf, depth, COLOR_V); } @@ -521,10 +523,11 @@ void 
kvz_quantize_lcu_residual(encoder_state_t * const state, const int32_t x2 = x + offset; const int32_t y2 = y + offset; - kvz_quantize_lcu_residual(state, luma, chroma, x, y, depth + 1, NULL, lcu, early_skip); - kvz_quantize_lcu_residual(state, luma, chroma, x2, y, depth + 1, NULL, lcu, early_skip); - kvz_quantize_lcu_residual(state, luma, chroma, x, y2, depth + 1, NULL, lcu, early_skip); - kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip); + // jccr is currently not supported if transform is split + kvz_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { @@ -546,10 +549,10 @@ void kvz_quantize_lcu_residual(encoder_state_t * const state, } if (chroma) { quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip); - if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){ - quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip); - } + quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip); + } + if (jccr && cur_pu->tr_depth == cur_pu->depth) { + quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip); } } } diff --git a/src/transform.h b/src/transform.h index b2b12689..bba5cabc 100644 --- a/src/transform.h +++ b/src/transform.h @@ -67,14 +67,16 @@ void kvz_itransform2d(const encoder_control_t * const encoder, int32_t kvz_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale); -void kvz_quantize_lcu_residual(encoder_state_t 
*state, - bool luma, - bool chroma, - int32_t x, - int32_t y, - uint8_t depth, - cu_info_t *cur_cu, - lcu_t* lcu, - bool early_skip); +void kvz_quantize_lcu_residual( + encoder_state_t *state, + bool luma, + bool chroma, + const bool jccr, + int32_t x, + int32_t y, + uint8_t depth, + cu_info_t *cur_cu, + lcu_t* lcu, + bool early_skip); #endif From b2a94d42763f7754aa5da610bab4e03666298e1a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Mar 2022 12:09:14 +0200 Subject: [PATCH 087/135] Move transform coding and chroma cu coding out of encode_intra_coding_unit --- src/encode_coding_tree.c | 40 +++++++++++++++++++--------------------- src/search_intra.c | 30 +++++++++++++++++------------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index a6adb249..ffcd2dac 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -814,7 +814,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c } } -static void encode_intra_coding_unit(encoder_state_t * const state, +static void encode_intra_luma_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, int x, int y, int depth, lcu_t* lcu, lcu_coeff_t* coeff, double* bits_out) @@ -1063,24 +1063,6 @@ static void encode_intra_coding_unit(encoder_state_t * const state, if (cabac->only_count && bits_out) *bits_out += 5; } } - - // Code chroma prediction mode. 
- if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); - } - // if we are counting bits, the cost for transform coeffs is done separately - // To get the distortion at the same time - if (!cabac->only_count) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); - - encode_mts_idx(state, cabac, cur_cu); - - if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); - } - } - } /** @@ -1511,7 +1493,23 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL); + encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL); + + // Code chroma prediction mode. 
+ if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { + encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); + } + + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); + + encode_mts_idx(state, cabac, cur_cu); + + // For 4x4 the chroma PU/TU is coded after the last + if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { + encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); + + } } else { @@ -1640,7 +1638,7 @@ double kvz_mock_encode_coding_unit( } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits); + encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits); } return bits; } diff --git a/src/search_intra.c b/src/search_intra.c index bd1433d9..2f7d765c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -915,21 +915,25 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const break; } } - cabac_ctx_t *ctx = &(cabac->ctx.luma_planar_model[1]); - CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); - if (state->search_cabac.update) { - if(mode_in_preds) { - CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); - if(luma_mode != intra_preds[0]) { - CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); + cabac_ctx_t* ctx = &(cabac->ctx.luma_planar_model[1]); + CABAC_FBITS_UPDATE( + cabac, + ctx, + mode_in_preds != -1, + mode_bits, + "prev_intra_luma_pred_flag_search"); + if (state->search_cabac.update) { + if (mode_in_preds) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); + if (luma_mode != intra_preds[0]) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); + } + } else { + // This value should be transformed for actual coding, + // 
but here the value does not actually matter, just that we write 5 bits + CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode"); } } - else { - // This value should be transformed for actual coding, - // but here the value does not actually matter, just that we write 5 bits - CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode"); - } - } bool enable_mrl = state->encoder_control->cfg.mrl; uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0; From 7aa361696d271701b24fb95fb9de66e3c8d10f4d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Mar 2022 13:34:26 +0200 Subject: [PATCH 088/135] Facilitate bit counting in encode_intra_luma_coding_unit --- src/alf.c | 6 +- src/cabac.c | 4 +- src/cabac.h | 2 +- src/encode_coding_tree.c | 274 +++++++++++++++++++-------------------- src/search_intra.c | 2 +- 5 files changed, 144 insertions(+), 144 deletions(-) diff --git a/src/alf.c b/src/alf.c index 868616fd..d7cbc8a5 100644 --- a/src/alf.c +++ b/src/alf.c @@ -1236,19 +1236,19 @@ static void code_alf_ctu_filter_index(encoder_state_t * const state, assert(filter_set_idx < num_available_filt_sets); //"temporal non-latest set" if (num_aps > 1) { - kvz_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS); + kvz_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS, NULL); } } else { assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //"fixed set larger than temporal" - kvz_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS); + kvz_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL); } } else { assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //Fixed set numavail < num_fixed - kvz_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS); + kvz_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL); } } diff --git 
a/src/cabac.c b/src/cabac.c index a35358ae..dd8aeffc 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -200,7 +200,7 @@ void kvz_cabac_encode_bin_trm(cabac_data_t * const data, const uint8_t bin_value /** * \brief encode truncated binary code */ -void kvz_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value) { +void kvz_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value, double* bits_out) { int thresh; int symbol = bin_value; if (max_value > 256) { @@ -220,9 +220,11 @@ void kvz_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_va int b = max_value - val; if (symbol < val - b) { CABAC_BINS_EP(data, symbol, thresh, "TruncSymbols"); + if (bits_out) *bits_out += 1; } else { symbol += val - b; CABAC_BINS_EP(data, symbol, thresh + 1, "TruncSymbols"); + if (bits_out) *bits_out += 1; } } diff --git a/src/cabac.h b/src/cabac.h index 92c2d6b8..eff15220 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -134,7 +134,7 @@ extern const uint8_t kvz_g_auc_renorm_table[32]; void kvz_cabac_start(cabac_data_t *data); void kvz_cabac_encode_bin(cabac_data_t *data, uint32_t bin_value); void kvz_cabac_encode_bin_ep(cabac_data_t *data, uint32_t bin_value); -void kvz_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value); +void kvz_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value, double* bits_out); void kvz_cabac_encode_bins_ep(cabac_data_t *data, uint32_t bin_values, int num_bins); void kvz_cabac_encode_bin_trm(cabac_data_t *data, uint8_t bin_value); void kvz_cabac_write(cabac_data_t *data); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index ffcd2dac..12d8a98b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -817,16 +817,17 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c static void encode_intra_luma_coding_unit(encoder_state_t * const state, 
cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, lcu_t* lcu, lcu_coeff_t* coeff, double* bits_out) + int x, int y, int depth, lcu_t* lcu, double* bits_out) { const videoframe_t * const frame = state->tile->frame; - uint8_t intra_pred_mode_actual[4]; - uint8_t *intra_pred_mode = intra_pred_mode_actual; + uint8_t intra_pred_mode_actual; + uint8_t *intra_pred_mode = &intra_pred_mode_actual; //uint8_t intra_pred_mode_chroma = cur_cu->intra.mode_chroma; - int8_t intra_preds[4][INTRA_MPM_COUNT] = {{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1}}; - int8_t mpm_preds[4] = {-1, -1, -1, -1}; - uint32_t flag[4]; + int8_t intra_preds[INTRA_MPM_COUNT] = {-1, -1, -1, -1, -1, -1}; + int8_t mpm_preds = -1; + uint32_t flag; + double bits = 0; /* if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) { @@ -850,8 +851,6 @@ static void encode_intra_luma_coding_unit(encoder_state_t * const state, CABAC_BIN(cabac, 0, "bdpcm_mode"); } */ - - const int num_pred_units = kvz_part_mode_num_parts[cur_cu->part_size]; // Intra Subpartition mode uint32_t width = (LCU_WIDTH >> depth); @@ -889,15 +888,17 @@ static void encode_intra_luma_coding_unit(encoder_state_t * const state, if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) { const int cu_width = LCU_WIDTH >> depth; const int cu_height = cu_width; // TODO: height for non-square blocks - uint8_t ctx_id = kvz_get_mip_flag_context(x, y, cu_width, cu_height, NULL, frame->cu_array); + uint8_t ctx_id = kvz_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? 
NULL : frame->cu_array); // Write MIP flag - cabac->cur_ctx = &(cabac->ctx.mip_flag[ctx_id]); - CABAC_BIN(cabac, mip_flag, "mip_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag"); if (mip_flag) { // Write MIP transpose flag & mode CABAC_BIN_EP(cabac, mip_transpose, "mip_transposed"); - kvz_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes); + if (cabac->only_count) bits += 1; + kvz_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes, bits_out); + if (cabac->only_count) *bits_out += bits; + return; } } @@ -911,158 +912,155 @@ static void encode_intra_luma_coding_unit(encoder_state_t * const state, if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) { if (MAX_REF_LINE_IDX > 1) { - cabac->cur_ctx = &(cabac->ctx.multi_ref_line[0]); - CABAC_BIN(cabac, multi_ref_idx != 0, "multi_ref_line"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[0]), multi_ref_idx != 0, bits, "multi_ref_line"); if (MAX_REF_LINE_IDX > 2 && multi_ref_idx != 0) { - cabac->cur_ctx = &(cabac->ctx.multi_ref_line[1]); - CABAC_BIN(cabac, multi_ref_idx != 1, "multi_ref_line") + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[1]), multi_ref_idx != 1, bits, "multi_ref_line"); } } } // ToDo: update real usage, these if clauses as such don't make any sense - if (isp_mode != 0 && multi_ref_idx == 0 && !mip_flag) { + if (isp_mode != 0 && multi_ref_idx == 0) { if (isp_mode) { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]); - CABAC_BIN(cabac, 0, "intra_subPartitions"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); } else { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]); - CABAC_BIN(cabac, 1, "intra_subPartitions"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); // ToDo: complete this if-clause if (isp_mode == 3) { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[1]); - 
CABAC_BIN(cabac, allow_isp - 1, "intra_subPart_ver_hor"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); } } } const int cu_width = LCU_WIDTH >> depth; - // If MIP is used, skip writing normal intra modes - if (!mip_flag) { // PREDINFO CODING // If intra prediction mode is found from the predictors, // it can be signaled with two EP's. Otherwise we can send // 5 EP bins with the full predmode // ToDo: fix comments for VVC - cabac->cur_ctx = &(cabac->ctx.intra_luma_mpm_flag_model); - for (int j = 0; j < num_pred_units; ++j) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t* cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); + const cu_info_t* cur_pu = cur_cu; // kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - const cu_info_t* left_pu = NULL; - const cu_info_t* above_pu = NULL; + const cu_info_t* left_pu = NULL; + const cu_info_t* above_pu = NULL; - if (pu_x > 0) { - assert(pu_x >> 2 > 0); - left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1); - } - // Don't take the above PU across the LCU boundary. - if (pu_y % LCU_WIDTH > 0 && pu_y > 0) { - assert(pu_y >> 2 > 0); - above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1); - } + if (x > 0) { + assert(x >> 2 > 0); + left_pu = lcu ? + LCU_GET_CU_AT_PX( + lcu, + SUB_SCU(x - 1), + SUB_SCU(y + cu_width - 1)) : + kvz_cu_array_at_const( + frame->cu_array, + x - 1, + y + cu_width - 1); + } + // Don't take the above PU across the LCU boundary. + if (y % LCU_WIDTH > 0 && y > 0) { + assert(y >> 2 > 0); + above_pu = lcu ? 
+ LCU_GET_CU_AT_PX( + lcu, + SUB_SCU(x + cu_width - 1), + SUB_SCU(y -1)) : + kvz_cu_array_at_const( + frame->cu_array, + x + cu_width - 1, + y - 1); + } + + kvz_intra_get_dir_luma_predictor(x, y, + intra_preds, + cur_pu, + left_pu, above_pu); + intra_pred_mode_actual = cur_pu->intra.mode; - kvz_intra_get_dir_luma_predictor(pu_x, pu_y, - intra_preds[j], - cur_pu, - left_pu, above_pu); - - - intra_pred_mode_actual[j] = cur_pu->intra.mode; - - for (int i = 0; i < INTRA_MPM_COUNT; i++) { - if (intra_preds[j][i] == intra_pred_mode[j]) { - mpm_preds[j] = (int8_t)i; - break; - } - } - // Is the mode in the MPM array or not - flag[j] = (mpm_preds[j] == -1) ? 0 : 1; - if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { - CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); - } - } - - for (int j = 0; j < num_pred_units; ++j) { - // TODO: this loop is unnecessary in VVC. Remove in future - assert(j == 0 && "In VVC this loop should be run only once."); - - // Signal index of the prediction mode in the prediction list, if it is there - if (flag[j]) { - - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t* cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - cabac->cur_ctx = &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]); - if (cur_pu->intra.multi_ref_idx == 0) { - CABAC_BIN(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx_luma_planar"); - } - //CABAC_BIN_EP(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx"); - if (mpm_preds[j] > 0) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 1 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 1) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 2 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 2) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 3 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 3) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 4 ? 1 : 0), "mpm_idx"); - } - } - else { - // Signal the actual prediction mode. 
- int32_t tmp_pred = intra_pred_mode[j]; - - uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2]; - memcpy(intra_preds_temp, intra_preds[j], sizeof(int8_t) * 3); - memcpy(intra_preds_temp + 4, &intra_preds[j][3], sizeof(int8_t) * 3); - intra_preds_temp[3] = 255; - intra_preds_temp[7] = 255; - - // Improvised merge sort - // Sort prediction list from lowest to highest. - if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t); - if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t); - if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t); - - if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t); - if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t); - if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t); - - // Merge two subarrays - int32_t array1 = 0; - int32_t array2 = 4; - for (int item = 0; item < INTRA_MPM_COUNT; item++) { - if (intra_preds_temp[array1] < intra_preds_temp[array2]) { - intra_preds[j][item] = intra_preds_temp[array1]; - array1++; - } - else { - intra_preds[j][item] = intra_preds_temp[array2]; - array2++; - } - } - - // Reduce the index of the signaled prediction mode according to the - // prediction list, as it has been already signaled that it's not one - // of the prediction modes. - for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) { - if (tmp_pred > intra_preds[j][i]) { - tmp_pred--; - } - } - - kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT); - } - if (cabac->only_count && bits_out) *bits_out += 5; + for (int i = 0; i < INTRA_MPM_COUNT; i++) { + if (intra_preds[i] == *intra_pred_mode) { + mpm_preds = (int8_t)i; + break; } } + // Is the mode in the MPM array or not + flag = (mpm_preds == -1) ? 
0 : 1; + if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag"); + } + + // Signal index of the prediction mode in the prediction list, if it is there + if (flag) { + + const cu_info_t* cur_pu = cur_cu; + if (cur_pu->intra.multi_ref_idx == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]), (mpm_preds > 0 ? 1 : 0), bits, "mpm_idx_luma_planar"); + } + + if (mpm_preds > 0) { + CABAC_BIN_EP(cabac, (mpm_preds > 1 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 1) { + CABAC_BIN_EP(cabac, (mpm_preds > 2 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 2) { + CABAC_BIN_EP(cabac, (mpm_preds > 3 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 3) { + CABAC_BIN_EP(cabac, (mpm_preds > 4 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + } + else { + // Signal the actual prediction mode. + int32_t tmp_pred = *intra_pred_mode; + + uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2]; + memcpy(intra_preds_temp, intra_preds, sizeof(int8_t) * 3); + memcpy(intra_preds_temp + 4, &intra_preds[3], sizeof(int8_t) * 3); + intra_preds_temp[3] = 255; + intra_preds_temp[7] = 255; + + // Improvised merge sort + // Sort prediction list from lowest to highest. 
+ if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t); + if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t); + if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t); + + if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t); + if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t); + if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t); + + // Merge two subarrays + int32_t array1 = 0; + int32_t array2 = 4; + for (int item = 0; item < INTRA_MPM_COUNT; item++) { + if (intra_preds_temp[array1] < intra_preds_temp[array2]) { + intra_preds[item] = intra_preds_temp[array1]; + array1++; + } + else { + intra_preds[item] = intra_preds_temp[array2]; + array2++; + } + } + + // Reduce the index of the signaled prediction mode according to the + // prediction list, as it has been already signaled that it's not one + // of the prediction modes. + for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) { + if (tmp_pred > intra_preds[i]) { + tmp_pred--; + } + } + + kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT, bits_out); + } + if (cabac->only_count && bits_out) *bits_out += bits; } /** @@ -1493,7 +1491,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } else if (cur_cu->type == CU_INTRA) { - encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL); + encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); // Code chroma prediction mode. 
if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { @@ -1638,7 +1636,7 @@ double kvz_mock_encode_coding_unit( } } else if (cur_cu->type == CU_INTRA) { - encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits); + encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); } return bits; } diff --git a/src/search_intra.c b/src/search_intra.c index 2f7d765c..f700a165 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -900,7 +900,7 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const if (mip_flag) { // Write MIP transpose flag & mode CABAC_BIN_EP(cabac, is_transposed, "mip_transposed"); - kvz_cabac_encode_trunc_bin(cabac, mip_mode, transp_off); + kvz_cabac_encode_trunc_bin(cabac, mip_mode, transp_off, NULL); } // Write is done. Get bit cost out of cabac From b321015426fb8a8e690d88fa3206797042122d10 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 24 Mar 2022 14:49:05 +0200 Subject: [PATCH 089/135] Use kvz_luma_mode_bits to count luma_mode_bits --- src/cabac.c | 4 +- src/encode_coding_tree.c | 8 +-- src/encode_coding_tree.h | 5 ++ src/search.c | 16 +---- src/search_intra.c | 139 +++++++++++---------------------------- src/search_intra.h | 3 +- 6 files changed, 50 insertions(+), 125 deletions(-) diff --git a/src/cabac.c b/src/cabac.c index dd8aeffc..7bb73ea5 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -220,11 +220,11 @@ void kvz_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_va int b = max_value - val; if (symbol < val - b) { CABAC_BINS_EP(data, symbol, thresh, "TruncSymbols"); - if (bits_out) *bits_out += 1; + if (bits_out) *bits_out += thresh; } else { symbol += val - b; CABAC_BINS_EP(data, symbol, thresh + 1, "TruncSymbols"); - if (bits_out) *bits_out += 1; + if (bits_out) *bits_out += thresh + 1; } } diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 12d8a98b..d7cf5efb 100644 --- a/src/encode_coding_tree.c +++ 
b/src/encode_coding_tree.c @@ -814,10 +814,10 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c } } -static void encode_intra_luma_coding_unit(encoder_state_t * const state, +void kvz_encode_intra_luma_coding_unit(const encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, lcu_t* lcu, double* bits_out) + int x, int y, int depth, const lcu_t* lcu, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual; @@ -1491,7 +1491,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } else if (cur_cu->type == CU_INTRA) { - encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); + kvz_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); // Code chroma prediction mode. if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { @@ -1636,7 +1636,7 @@ double kvz_mock_encode_coding_unit( } } else if (cur_cu->type == CU_INTRA) { - encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + kvz_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); } return bits; } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 24f2759d..23d25af5 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -73,6 +73,11 @@ int kvz_encode_inter_prediction_unit(encoder_state_t* const state, lcu_t* lcu, double* bits_out); +void kvz_encode_intra_luma_coding_unit(const encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int depth, const lcu_t* lcu, double* bits_out); + void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, diff --git a/src/search.c b/src/search.c index da81b0fc..5c409b3f 100644 --- a/src/search.c +++ b/src/search.c @@ -714,23 +714,9 @@ static double 
calc_mode_bits(const encoder_state_t *state, const cu_info_t * cur_cu, int x, int y, int depth) { - int x_local = SUB_SCU(x); - int y_local = SUB_SCU(y); - assert(cur_cu->type == CU_INTRA); - int8_t candidate_modes[INTRA_MPM_COUNT]; - { - const cu_info_t *left_cu = ((x >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local - SCU_WIDTH, y_local) : NULL); - const cu_info_t *above_cu = ((y >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local, y_local - SCU_WIDTH) : NULL); - kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); - } - - int width = LCU_WIDTH >> depth; - int height = width; // TODO: height for non-square blocks - int num_mip_modes_half = NUM_MIP_MODES_HALF(width, height); - int mip_flag_ctx_id = kvz_get_mip_flag_context(x, y, width, height, lcu, NULL); - double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, num_mip_modes_half, mip_flag_ctx_id); + double mode_bits = kvz_luma_mode_bits(state, cur_cu, x, y, depth, lcu); if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != KVZ_CSP_400) { mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); diff --git a/src/search_intra.c b/src/search_intra.c index f700a165..4eee0d64 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" #include "encoderstate.h" +#include "encode_coding_tree.h" #include "image.h" #include "intra.h" #include "kvazaar.h" @@ -692,12 +693,35 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. 
+ const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); + const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); + const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); + const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0); + int i = 0; + int smaller_than_pred = 0; + double bits; + for(; i < INTRA_MPM_COUNT; i++) { + if (intra_preds[i] == mode_i) { + break; + } + if(mode_i > intra_preds[i]) { + smaller_than_pred += 1; + } + } + if (i == 0) { + bits = planar_mode_flag + mpm_mode_bit; + } + else if (i < INTRA_MPM_COUNT) { + bits = not_planar_mode_flag + mpm_mode_bit + MAX(i, 3); + } + else { + bits = not_mpm_mode_bit + 5 + (mode_i - smaller_than_pred > 3); + } + costs[mode_i] += state->lambda_sqrt * bits; } #undef PARALLEL_BLKS - return modes_selected; } @@ -774,7 +798,6 @@ static int8_t search_intra_rdo(encoder_state_t * const state, for (int mip = 0; mip <= 1; mip++) { const int transp_off = mip ? num_mip_modes_full >> 1 : 0; - uint8_t ctx_id = mip ? kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL) : 0; uint8_t multi_ref_index = mip ? 0 : multi_ref_idx; int *num_modes = mip ? &num_mip_modes_full : &modes_to_check; @@ -782,9 +805,6 @@ static int8_t search_intra_rdo(encoder_state_t * const state, int8_t mode = mip ? mip_modes[i] : modes[i]; double *mode_cost_p = mip ? &mip_costs[i] : &costs[i]; int8_t *mode_trafo_p = mip ? 
&mip_trafo[i] : &trafo[i]; - int rdo_bitcost = kvz_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id); - - *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5); // Mip related stuff // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream. @@ -806,6 +826,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.joint_cb_cr = 0; FILL(pred_cu.cbf, 0); + int rdo_bitcost = kvz_luma_mode_bits(state, &pred_cu, x_px, y_px, depth, lcu); + *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5); + // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); @@ -867,105 +890,17 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } -double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id) +double kvz_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) { cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; - - bool enable_mip = state->encoder_control->cfg.mip; - bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false; - - // Mip flag cost must be calculated even if mip is not used in this block - if (enable_mip) { - // Make a copy of state->cabac for bit cost estimation. - cabac_data_t state_cabac_copy; - cabac_data_t* cabac; - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); - // Clear data and set mode to count only - state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; - - cabac = &state_cabac_copy; - - // Do cabac writes as normal - const int transp_off = num_mip_modes_half; - const bool is_transposed = luma_mode >= transp_off ? true : false; - int8_t mip_mode = is_transposed ? 
luma_mode - transp_off : luma_mode; - - // Write MIP flag - cabac->cur_ctx = &(cabac->ctx.mip_flag[mip_flag_ctx_id]); - CABAC_BIN(cabac, mip_flag, "mip_flag"); - - if (mip_flag) { - // Write MIP transpose flag & mode - CABAC_BIN_EP(cabac, is_transposed, "mip_transposed"); - kvz_cabac_encode_trunc_bin(cabac, mip_mode, transp_off, NULL); - } - - // Write is done. Get bit cost out of cabac - mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); - } - - if (!mip_flag) { - int8_t mode_in_preds = -1; - for (int i = 0; i < INTRA_MPM_COUNT; ++i) { - if (luma_mode == intra_preds[i]) { - mode_in_preds = i; - break; - } - } - cabac_ctx_t* ctx = &(cabac->ctx.luma_planar_model[1]); - CABAC_FBITS_UPDATE( - cabac, - ctx, - mode_in_preds != -1, - mode_bits, - "prev_intra_luma_pred_flag_search"); - if (state->search_cabac.update) { - if (mode_in_preds) { - CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); - if (luma_mode != intra_preds[0]) { - CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); - } - } else { - // This value should be transformed for actual coding, - // but here the value does not actually matter, just that we write 5 bits - CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode"); - } - } - - bool enable_mrl = state->encoder_control->cfg.mrl; - uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0; - - ctx = &(cabac->ctx.intra_luma_mpm_flag_model); - - if (multi_ref_index == 0) { - mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1); - } - - // Add MRL bits. 
- if (enable_mrl && MAX_REF_LINE_IDX > 1) { - ctx = &(cabac->ctx.multi_ref_line[0]); - mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0); - - if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) { - ctx = &(cabac->ctx.multi_ref_line[1]); - mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1); - } - } - - if (mode_in_preds != -1 || multi_ref_index != 0) { - ctx = &(cabac->ctx.luma_planar_model[0]); - if (multi_ref_index == 0) { - mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0); - } - mode_bits += MIN(4.0, mode_in_preds); - } - else { - mode_bits += 6.0; - } - } + cabac_data_t cabac_copy; + memcpy(&cabac_copy, cabac, sizeof cabac_copy); + kvz_encode_intra_luma_coding_unit( + state, + &cabac_copy, cur_cu, + x, y, depth, lcu, &mode_bits + ); return mode_bits; } diff --git a/src/search_intra.h b/src/search_intra.h index 13a830e0..8376889f 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -43,8 +43,7 @@ #include "global.h" // IWYU pragma: keep #include "intra.h" -double kvz_luma_mode_bits(const encoder_state_t *state, - int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id); +double kvz_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu); double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode); From 2050de88913b742ac78fb740816f901d3083a16e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 08:15:30 +0200 Subject: [PATCH 090/135] Fix cclm prediction generation for depth 0 --- src/intra.c | 9 ++++++++- src/search_intra.c | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/intra.c b/src/intra.c index 9976464b..b955302e 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1430,8 +1430,15 @@ static void intra_recon_tb_leaf( } } else { kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width); 
+ if (LCU_GET_CU_AT_PX(lcu, x_scu, y_scu)->depth != depth) { + cclm_parameters_t temp_params; + kvz_predict_cclm( + state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params); + } + else { + linear_transform_cclm(&intra_paramas->cclm_parameters[color == COLOR_U ? 0 : 1], pred, pred, width, width); + } - linear_transform_cclm(&intra_paramas->cclm_parameters[color == COLOR_U ? 0 : 1], pred, pred, width, width); } const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/search_intra.c b/src/search_intra.c index 4eee0d64..23a6631b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1040,6 +1040,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, if(tr_cu->depth != tr_cu->tr_depth) { chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); } else { + chroma.cost = 0; kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &chroma.cost); chroma.jccr = tr_cu->joint_cb_cr; } From 41c9f5b858f84978cbc8549ce266658e43ea17d6 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 09:02:37 +0200 Subject: [PATCH 091/135] Fix undefined behavior --- src/search_intra.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/search_intra.c b/src/search_intra.c index 23a6631b..ffaf3d58 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -818,6 +818,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.depth = depth; pred_cu.type = CU_INTRA; pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks + pred_cu.skipped = 0; + pred_cu.merged = 0; + pred_cu.bdpcmMode = 0; pred_cu.intra.mode = pred_mode; pred_cu.intra.mode_chroma = pred_mode; pred_cu.intra.multi_ref_idx = multi_ref_index; @@ -864,6 +867,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.depth = depth; pred_cu.type = CU_INTRA; pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? 
SIZE_NxN : SIZE_2Nx2N); + pred_cu.skipped = 0; + pred_cu.merged = 0; + pred_cu.bdpcmMode = 0; if (use_mip) { int transp_off = num_mip_modes_full >> 1; bool is_transposed = (mip_modes[0] >= transp_off ? true : false); From 99ddc209fc6e118b49432ffc3588411c2951baa7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 10:18:53 +0200 Subject: [PATCH 092/135] Fix cclm when tiles are enabled --- src/intra.c | 9 +++++---- src/search.c | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/intra.c b/src/intra.c index b955302e..f5f255f6 100644 --- a/src/intra.c +++ b/src/intra.c @@ -498,6 +498,7 @@ void kvz_predict_cclm( kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; + const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // Essentially what this does is that it uses 6-tap filtering to downsample // the luma intra references down to match the resolution of the chroma channel. @@ -513,7 +514,7 @@ void kvz_predict_cclm( } if(y_scu == 0) { if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4); - memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(kvz_pixel) * (width + available_above_right * 2)); + memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride2 / 2)], sizeof(kvz_pixel) * (width + available_above_right * 2)); } else { for (int x = 0; x < width * (available_above_right ? 
4 : 2); x += 2) { @@ -538,11 +539,11 @@ void kvz_predict_cclm( if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; } for(int i = 0; i < height + available_left_below * 2; i++) { - sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1]; + sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; } } - kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width); + kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride2) / 4], sampled_luma, width, height, stride2 / 2, width); int16_t a, b, shift; get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift); @@ -1402,7 +1403,7 @@ static void intra_recon_tb_leaf( kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); kvz_pixel pred[32 * 32]; - int stride = state->tile->frame->source->stride; + const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); bool use_mip = false; int8_t intra_mode = color == COLOR_Y ? 
intra_paramas->luma_mode : intra_paramas->chroma_mode; diff --git a/src/search.c b/src/search.c index 5c409b3f..244a2b7e 100644 --- a/src/search.c +++ b/src/search.c @@ -244,7 +244,8 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int x_scu = SUB_SCU(x); int y_scu = SUB_SCU(y); y_rec += x_scu + y_scu * LCU_WIDTH; - int stride = state->tile->frame->source->stride; + const int stride = state->tile->frame->rec->stride; + const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { for (int x_ = 0; x_ < width; x_++) { @@ -258,13 +259,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, s += y_rec[2 * x_ + LCU_WIDTH] * 2; s += y_rec[2 * x_ + 1 + LCU_WIDTH]; s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH]; - int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2; + int index = x / 2 + x_ + (y / 2 + y_ )* stride2 / 2; state->tile->frame->cclm_luma_rec[index] = s >> 3; } y_rec += LCU_WIDTH * 2; } if((y + height * 2) % 64 == 0) { - int line = y / 64 * stride / 2; + int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; for (int i = 0; i < width; ++i) { int s = 2; From 548f23f7d7616d8b39f398e12873e872cd6d2c33 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 11:07:35 +0200 Subject: [PATCH 093/135] temporarily set tile test to gop 0 --- tests/test_slices.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_slices.sh b/tests/test_slices.sh index 512888b0..a4166036 100755 --- a/tests/test_slices.sh +++ b/tests/test_slices.sh @@ -3,6 +3,6 @@ set -eu . 
"${0%/*}/util.sh" -valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 +valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --gop 0 --tiles=2x2 #valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp #if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi From eb8663fbd12c70bb3253b3c1a68f38de011d6f3b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 13:00:59 +0200 Subject: [PATCH 094/135] Move split flag coding to its own function --- src/encode_coding_tree.c | 249 ++++++++++++++++++--------------------- 1 file changed, 112 insertions(+), 137 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index d7cf5efb..37002645 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1151,6 +1151,115 @@ static void encode_part_mode(encoder_state_t * const state, } **/ + +static bool write_split_flag(const encoder_state_t * const state, cabac_data_t* cabac, + const cu_info_t * left_cu, const cu_info_t * above_cu, + uint8_t split_flag, + int depth, int cu_width, int x, int y, double* bits_out) +{ + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y; + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + // Implisit split flag when on border + // Exception made in VVC with flag not being implicit if the BT can be used for + // horizontal or vertical split, then this flag tells if QT or BT is used + + bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; + no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; + if (depth > MAX_DEPTH) allow_qt = false; + // ToDo: update this when btt is actually used + bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH + + + uint8_t implicit_split_mode = KVZ_NO_SPLIT; + //bool implicit_split = border; + bool bottom_left_available = ((abs_y + cu_width 
- 1) < ctrl->in.height); + bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width); + + if (!bottom_left_available && !top_right_available && allow_qt) { + implicit_split_mode = KVZ_QUAD_SPLIT; + } + else if (!bottom_left_available && allow_btt) { + implicit_split_mode = KVZ_HORZ_SPLIT; + } + else if (!top_right_available && allow_btt) { + implicit_split_mode = KVZ_VERT_SPLIT; + } + else if (!bottom_left_available || !top_right_available) { + implicit_split_mode = KVZ_QUAD_SPLIT; + } + + // Check split conditions + if (implicit_split_mode != KVZ_NO_SPLIT) { + no_split = th_split = tv_split = false; + bh_split = (implicit_split_mode == KVZ_HORZ_SPLIT); + bv_split = (implicit_split_mode == KVZ_VERT_SPLIT); + } + + if (!allow_btt) { + bh_split = bv_split = th_split = tv_split = false; + } + + bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; + + split_flag |= implicit_split_mode != KVZ_NO_SPLIT; + + int split_model = 0; + if (no_split && allow_split) { + // Get left and top block split_flags and if they are present and true, increase model number + // ToDo: should use height and width to increase model, PU_GET_W() ? 
+ if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + split_model++; + } + + if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + split_model++; + } + + uint32_t split_num = 0; + if (allow_qt) split_num += 2; + if (bh_split) split_num++; + if (bv_split) split_num++; + if (th_split) split_num++; + if (tv_split) split_num++; + + if (split_num > 0) split_num--; + + split_model += 3 * (split_num >> 1); + + cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag"); + } + + bool qt_split = split_flag || implicit_split_mode == KVZ_QUAD_SPLIT; + + if (!(implicit_split_mode == KVZ_NO_SPLIT) && (allow_qt && allow_btt)) { + split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag"); + } + + // Only signal split when it is not implicit, currently only Qt split supported + if (!(implicit_split_mode == KVZ_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { + + split_model = 0; + + // Get left and top block split_flags and if they are present and true, increase model number + if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + split_model++; + } + + if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + split_model++; + } + + split_model += (depth > 2 ? 
0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode"); + } + if (bits_out) *bits_out += bits; + return split_flag; +} + void kvz_encode_coding_tree(encoder_state_t * const state, uint16_t x, uint16_t y, @@ -1174,8 +1283,6 @@ void kvz_encode_coding_tree(encoder_state_t * const state, above_cu = kvz_cu_array_at_const((const cu_array_t*)frame->cu_array, x, y - 1); } - uint8_t split_flag = GET_SPLITDATA(cur_cu, depth); - uint8_t split_model = 0; // Absolute coordinates uint16_t abs_x = x + state->tile->offset_x; @@ -1195,116 +1302,8 @@ void kvz_encode_coding_tree(encoder_state_t * const state, // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { - // Implisit split flag when on border - // Exception made in VVC with flag not being implicit if the BT can be used for - // horizontal or vertical split, then this flag tells if QT or BT is used - - bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; - no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - if(depth > MAX_DEPTH) allow_qt = false; - // ToDo: update this when btt is actually used - bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH - + const int split_flag = write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, NULL); - - uint8_t implicit_split_mode = KVZ_NO_SPLIT; - //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_width - 1) < ctrl->in.height); - bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width); - - /* - if((depth >= 1 && (border_x != border_y))) implicit_split = false; - if (state->frame->slicetype != KVZ_SLICE_I) { - if (border_x != border_y) implicit_split = false; - if (!bottom_left_available && top_right_available) implicit_split = false; - if (!top_right_available && bottom_left_available) implicit_split = false; - } - */ - - - if (!bottom_left_available && 
!top_right_available && allow_qt) { - implicit_split_mode = KVZ_QUAD_SPLIT; - } else if (!bottom_left_available && allow_btt) { - implicit_split_mode = KVZ_HORZ_SPLIT; - } else if (!top_right_available && allow_btt) { - implicit_split_mode = KVZ_VERT_SPLIT; - } else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = KVZ_QUAD_SPLIT; - } - - //split_flag = implicit_split_mode != KVZ_NO_SPLIT; - - // Check split conditions - if (implicit_split_mode != KVZ_NO_SPLIT) { - no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == KVZ_HORZ_SPLIT); - bv_split = (implicit_split_mode == KVZ_VERT_SPLIT); - } - - if (!allow_btt) { - bh_split = bv_split = th_split = tv_split = false; - } - - bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; - - split_flag |= implicit_split_mode != KVZ_NO_SPLIT; - - if (no_split && allow_split) { - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - // ToDo: should use height and width to increase model, PU_GET_W() ? 
- if (left_cu && PU_GET_H(left_cu->part_size,LCU_WIDTH>>left_cu->depth,0) < LCU_WIDTH>>depth) { - split_model++; - } - - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { - split_model++; - } - - uint32_t split_num = 0; - if (allow_qt) split_num+=2; - if (bh_split) split_num++; - if (bv_split) split_num++; - if (th_split) split_num++; - if (tv_split) split_num++; - - if (split_num > 0) split_num--; - - split_model += 3 * (split_num >> 1); - - cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_BIN(cabac, split_flag, "SplitFlag"); - //fprintf(stdout, "split_model=%d %d / %d / %d / %d / %d\n", split_model, allow_qt, bh_split, bv_split, th_split, tv_split); - } - - bool qt_split = split_flag || implicit_split_mode == KVZ_QUAD_SPLIT; - - if (!(implicit_split_mode == KVZ_NO_SPLIT) && (allow_qt && allow_btt)) { - split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); - cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]); - CABAC_BIN(cabac, qt_split, "QT_SplitFlag"); - } - - // Only signal split when it is not implicit, currently only Qt split supported - if (!(implicit_split_mode == KVZ_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { - - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { - split_model++; - } - - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { - split_model++; - } - split_model += (depth > 2 ? 
0 : 3); - - cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]); - CABAC_BIN(cabac, split_flag, "split_cu_mode"); - } - if (split_flag || border) { // Split blocks and remember to change x and y block positions kvz_encode_coding_tree(state, x, y, depth + 1, coeff); @@ -1530,7 +1529,6 @@ double kvz_mock_encode_coding_unit( int x, int y, int depth, lcu_t* lcu, cu_info_t* cur_cu) { double bits = 0; - const encoder_control_t* const ctrl = state->encoder_control; int x_local = SUB_SCU(x); int y_local = SUB_SCU(y); @@ -1544,37 +1542,14 @@ double kvz_mock_encode_coding_unit( if (y) { above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); } - uint8_t split_model = 0; - - // Absolute coordinates - uint16_t abs_x = x + state->tile->offset_x; - uint16_t abs_y = y + state->tile->offset_y; - - // Check for slice border - bool border_x = ctrl->in.width < abs_x + cu_width; - bool border_y = ctrl->in.height < abs_y + cu_width; - bool border = border_x || border_y; /*!< are we in any border CU */ - + if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { - // Implicit split flag when on border - if (!border) { - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { - split_model++; - } - - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { - split_model++; - } - - // This mocks encoding the current CU so it should be never split - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag"); - } + write_split_flag(state, cabac, left_cu, above_cu, 0, depth, cu_width, x, y, &bits); } // Encode skip flag From e1fcd8b2e450331d5db964a295489285763e2f50 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 13:27:34 +0200 Subject: [PATCH 095/135] Add missing bits to kvz_mock_encode_coding_unit --- 
src/encode_coding_tree.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 37002645..3ec12be6 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1529,6 +1529,7 @@ double kvz_mock_encode_coding_unit( int x, int y, int depth, lcu_t* lcu, cu_info_t* cur_cu) { double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; int x_local = SUB_SCU(x); int y_local = SUB_SCU(y); @@ -1553,7 +1554,7 @@ double kvz_mock_encode_coding_unit( } // Encode skip flag - if (state->frame->slicetype != KVZ_SLICE_I) { + if (state->frame->slicetype != KVZ_SLICE_I && cu_width != 4) { int8_t ctx_skip = 0; if (left_cu && left_cu->skipped) { @@ -1598,21 +1599,24 @@ double kvz_mock_encode_coding_unit( } if (cur_cu->type == CU_INTER) { - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - - kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); + const uint8_t imv_mode = KVZ_IMV_OFF; + const int non_zero_mvd = kvz_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + if (ctrl->cfg.amvr && non_zero_mvd) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); + if (imv_mode > KVZ_IMV_OFF) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[4]), imv_mode, bits, "imv_flag"); + if (imv_mode < KVZ_IMV_HPEL) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[1]), imv_mode, bits, "imv_flag"); // 1 indicates 4PEL, 0 FPEL + } + } } } else if (cur_cu->type == CU_INTRA) { 
kvz_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); } + else { + assert(0 && "Unset cu type"); + } return bits; } From 0d7800c7f1926e06eab0f60e573ea2bc1e6b7138 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 13:54:37 +0200 Subject: [PATCH 096/135] Use kvz_write_split_flag for all split flags during search --- src/encode_coding_tree.c | 6 +++--- src/encode_coding_tree.h | 6 ++++++ src/search.c | 34 ++++++++-------------------------- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 3ec12be6..b611b7aa 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1152,7 +1152,7 @@ static void encode_part_mode(encoder_state_t * const state, **/ -static bool write_split_flag(const encoder_state_t * const state, cabac_data_t* cabac, +bool kvz_write_split_flag(const encoder_state_t * const state, cabac_data_t* cabac, const cu_info_t * left_cu, const cu_info_t * above_cu, uint8_t split_flag, int depth, int cu_width, int x, int y, double* bits_out) @@ -1302,7 +1302,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { - const int split_flag = write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, NULL); + const int split_flag = kvz_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, NULL); if (split_flag || border) { // Split blocks and remember to change x and y block positions @@ -1550,7 +1550,7 @@ double kvz_mock_encode_coding_unit( // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { - write_split_flag(state, cabac, left_cu, above_cu, 0, depth, cu_width, x, y, &bits); + kvz_write_split_flag(state, cabac, left_cu, above_cu, 0, depth, cu_width, x, y, &bits); } // Encode skip flag diff 
--git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 23d25af5..2091cff5 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -78,6 +78,12 @@ void kvz_encode_intra_luma_coding_unit(const encoder_state_t* const state, const cu_info_t* const cur_cu, int x, int y, int depth, const lcu_t* lcu, double* bits_out); + +bool kvz_write_split_flag(const encoder_state_t* const state, cabac_data_t* cabac, + const cu_info_t* left_cu, const cu_info_t* above_cu, + uint8_t split_flag, + int depth, int cu_width, int x, int y, double* bits_out); + void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, diff --git a/src/search.c b/src/search.c index 244a2b7e..a681a228 100644 --- a/src/search.c +++ b/src/search.c @@ -796,14 +796,6 @@ void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map) } -static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) -{ - vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; - bool condA = x >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x - 1, lcu_cu.y )->depth > depth; - bool condL = y >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y - 1)->depth > depth; - return condA + condL; -} - /** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -1152,16 +1144,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search"); + kvz_write_split_flag(state, &state->search_cabac, + x > 0 ? LCU_GET_CU_AT_PX(lcu, x -1, y ): NULL, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, x, y - 1) : NULL, + 1, depth, cu_width, x, y, &split_bits); } - if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { - // Add cost of intra part_size. - cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); - } state->search_cabac.update = 0; split_cost += split_bits * state->lambda; @@ -1198,16 +1186,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; - if (depth < MAX_DEPTH) { - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); - } - else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { - // Add cost of intra part_size. - cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); - CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search"); - } + kvz_write_split_flag(state, &state->search_cabac, + x > 0 ? LCU_GET_CU_AT_PX(lcu, x - 1, y) : NULL, + y > 0 ? LCU_GET_CU_AT_PX(lcu, x, y - 1) : NULL, + 0, depth, cu_width, x, y, & split_bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; From 84e4fe67903767991659971620dd1f2ce9cce00c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 25 Mar 2022 15:27:34 +0200 Subject: [PATCH 097/135] Fix broken things --- src/search.c | 8 ++++---- src/search_inter.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/search.c b/src/search.c index a681a228..9f5e1406 100644 --- a/src/search.c +++ b/src/search.c @@ -1145,8 +1145,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. kvz_write_split_flag(state, &state->search_cabac, - x > 0 ? 
LCU_GET_CU_AT_PX(lcu, x -1, y ): NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, x, y - 1) : NULL, + x > 0 ? LCU_GET_CU_AT_PX(lcu,SUB_SCU(x) -1, SUB_SCU(y)): NULL, + y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, 1, depth, cu_width, x, y, &split_bits); } @@ -1187,8 +1187,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cost = 0; double bits = 0; kvz_write_split_flag(state, &state->search_cabac, - x > 0 ? LCU_GET_CU_AT_PX(lcu, x - 1, y) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, x, y - 1) : NULL, + x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, + y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, 0, depth, cu_width, x, y, & split_bits); cur_cu->intra = cu_d1->intra; diff --git a/src/search_inter.c b/src/search_inter.c index c2203b72..1ec6ecbf 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2066,7 +2066,7 @@ static void search_pu_inter(encoder_state_t * const state, assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); kvz_sort_keys_by_cost(&amvp[2]); - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } From d531d77b75843f6772cb3487e4f065a53d6d13fc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 28 Mar 2022 10:21:15 +0300 Subject: [PATCH 098/135] fix chroma 4x4 --- src/search_intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index ffaf3d58..a8c08a6c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -376,7 +376,7 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_parameters.chroma_mode = chroma_mode; intra_parameters.jccr = -1; // TODO: Maybe check the jccr mode here also but holy shit is the interface of 
search_intra_rdo bad currently kvz_intra_recon_cu(state, - x_px, y_px, + x_px & ~7, y_px & ~7, depth, &intra_parameters, pred_cu, From 5cd0554a7dd300abaf165fddccc3aa9743297689 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 28 Mar 2022 14:08:35 +0300 Subject: [PATCH 099/135] fix correct chroma cost calculation --- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index 9f5e1406..2fa6ce45 100644 --- a/src/search.c +++ b/src/search.c @@ -563,7 +563,7 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, const int chroma_width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); - if(pred_cu->joint_cb_cr != 0) { + if(pred_cu->joint_cb_cr == 0) { if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], From 7ef25a23a9424d1c36104c11f4e348aad6cfcbf7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 29 Mar 2022 09:27:56 +0300 Subject: [PATCH 100/135] Fix chroma cost calculation --- src/search.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/search.c b/src/search.c index 2fa6ce45..0a9c2b78 100644 --- a/src/search.c +++ b/src/search.c @@ -558,9 +558,9 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, } unsigned chroma_ssd = 0; - if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) { - const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; - const int chroma_width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if(state->encoder_control->chroma_format != KVZ_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) { + const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 }; + const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); if(pred_cu->joint_cb_cr == 0) { From df5bc0db4e76680414f423c5b9360e77be3e94a6 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 30 Mar 2022 10:10:02 +0300 Subject: [PATCH 101/135] Fix cclm when 64x64 is enabled --- src/search.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/search.c b/src/search.c index 0a9c2b78..0d89ab67 100644 --- a/src/search.c +++ b/src/search.c @@ -1253,6 +1253,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); } } + else { + downsample_cclm_rec( + state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + ); + } } else if (depth >= 0 && depth < MAX_PU_DEPTH) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. 
From 2bf16c2010d79bb531589ebf66ea5212694d6b5e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 30 Mar 2022 11:00:17 +0300 Subject: [PATCH 102/135] Fix correct precision for all motion vectors --- src/search_inter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 1ec6ecbf..b1e42f35 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -472,7 +472,7 @@ static bool early_terminate(inter_search_info_t *info, { 0, -1 }, { -1, 0 }, { 0, 0 }, }; - vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; int first_index = 0; int last_index = 3; @@ -668,7 +668,7 @@ static void tz_search(inter_search_info_t *info, int best_dist = 0; - vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; + vector2d_t start = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // step 2, grid search int rounds_without_improvement = 0; @@ -768,7 +768,7 @@ static void hexagon_search(inter_search_info_t *info, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. 
int best_index = 0; From c777acd91e2e3c24fd01664f65f9a05c1667af1b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 30 Mar 2022 15:06:44 +0300 Subject: [PATCH 103/135] correct calculation of mvd bitcost --- src/search_inter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index b1e42f35..22c633b4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -350,20 +350,18 @@ static double get_mvd_coding_cost(const encoder_state_t* state, if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs - 2) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs - 2); } - bitcost += CTX_FRAC_ONE_BIT; + bitcost += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs - 2) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs - 2) ; } - bitcost += CTX_FRAC_ONE_BIT; + bitcost += 1; } - - - // Round and shift back to integer bits. 
- return bitcost / (1 << CTX_FRAC_BITS); + + return bitcost; } From 45a9bed7fb7a9f9488106980963c61333f9152f2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 10:30:02 +0300 Subject: [PATCH 104/135] Don't count jccr bits when jccr is disabled --- src/search_inter.c | 2 +- src/search_intra.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 22c633b4..5135c517 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2172,7 +2172,7 @@ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, if(cbf) { *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); if (reconstruct_chroma) { - if (cur_cu->depth != cur_cu->tr_depth) { + if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); } else { diff --git a/src/search_intra.c b/src/search_intra.c index a8c08a6c..8a6e24b1 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1043,7 +1043,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, lcu); } double bits = 0; - if(tr_cu->depth != tr_cu->tr_depth) { + if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); } else { chroma.cost = 0; From f5753394a0487f514bd21f1184b84fe14fc979cd Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 11:30:27 +0300 Subject: [PATCH 105/135] Update intra chroma mode contexts --- src/encode_coding_tree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index b611b7aa..6912787d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -732,14 +732,11 @@ int kvz_encode_inter_prediction_unit(encoder_state_t * const state, return non_zero_mvd; } -static void encode_chroma_intra_cu(cabac_data_t* const cabac, const 
cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width, const int cclm_enabled) { +static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, const int cclm_enabled) { unsigned pred_mode = 0; unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83}; - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, 0); - const cu_info_t *first_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - int8_t chroma_intra_dir = first_pu->intra.mode_chroma; - int8_t luma_intra_dir = first_pu->intra.mode; + int8_t chroma_intra_dir = cur_cu->intra.mode_chroma; + int8_t luma_intra_dir = cur_cu->intra.mode; bool derived_mode = chroma_intra_dir == luma_intra_dir; @@ -1494,7 +1491,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, // Code chroma prediction mode. if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); } encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); @@ -1503,7 +1500,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); } @@ -1613,6 +1610,9 @@ double kvz_mock_encode_coding_unit( } else if (cur_cu->type == CU_INTRA) { kvz_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != KVZ_CSP_400) { + 
encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); + } } else { assert(0 && "Unset cu type"); From 03fd8f1545862d154efe97a0140f9368c1674e0f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 11:45:49 +0300 Subject: [PATCH 106/135] Don't write jccr flag for inter CUs when it cannot be enabled --- src/encode_coding_tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 6912787d..f31c950d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -594,7 +594,13 @@ static void encode_transform_coeff(encoder_state_t * const state, state->must_code_qp_delta = false; } - if((cb_flag_u || cb_flag_v ) && (depth != 4 || only_chroma) && state->encoder_control->cfg.jccr) { + if(( + ((cb_flag_u || cb_flag_v ) + && cur_cu->type == CU_INTRA) + || (cb_flag_u && cb_flag_v)) + && (depth != 4 || only_chroma) + && state->encoder_control->cfg.jccr + ) { cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } From 57cd9bd97e293243ab78ca6d71fabf7020fdc9af Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 12:11:33 +0300 Subject: [PATCH 107/135] minor fixes --- src/intra.c | 4 ++-- src/search_intra.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intra.c b/src/intra.c index f5f255f6..e3606eb9 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1429,7 +1429,8 @@ static void intra_recon_tb_leaf( else { kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index); } - } else { + } + else { kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width); if (LCU_GET_CU_AT_PX(lcu, x_scu, y_scu)->depth != depth) { cclm_parameters_t temp_params; @@ -1439,7 +1440,6 @@ static void intra_recon_tb_leaf( else { 
linear_transform_cclm(&intra_paramas->cclm_parameters[color == COLOR_U ? 0 : 1], pred, pred, width, width); } - } const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/search_intra.c b/src/search_intra.c index 8a6e24b1..d896c091 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -829,8 +829,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, pred_cu.joint_cb_cr = 0; FILL(pred_cu.cbf, 0); - int rdo_bitcost = kvz_luma_mode_bits(state, &pred_cu, x_px, y_px, depth, lcu); - *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5); + double rdo_bitcost = kvz_luma_mode_bits(state, &pred_cu, x_px, y_px, depth, lcu); + *mode_cost_p = rdo_bitcost * state->lambda; // Reset transform split data in lcu.cu for this area. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); From 6924497bae4890e69733912bdcbbe39eac644e81 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 14:59:22 +0300 Subject: [PATCH 108/135] Fix jccr when inter is used --- src/inter.c | 10 +++++++++- src/search.c | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/inter.c b/src/inter.c index 44ac599f..177428e1 100644 --- a/src/inter.c +++ b/src/inter.c @@ -624,7 +624,9 @@ void kvz_inter_pred_pu(const encoder_state_t * const state, int i_pu) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); const int pu_w = PU_GET_W(cu->part_size, width, i_pu); @@ -673,6 +675,12 @@ void kvz_inter_pred_pu(const encoder_state_t * const state, NULL, predict_luma, predict_chroma); } + + if (predict_chroma && state->encoder_control->cfg.jccr) { + const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + kvz_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, 
LCU_WIDTH_C); + kvz_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + } } /** diff --git a/src/search.c b/src/search.c index 0d89ab67..12f75832 100644 --- a/src/search.c +++ b/src/search.c @@ -1059,6 +1059,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, NULL, lcu, false); + if (cur_cu->depth == cur_cu->tr_depth && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr) { + kvz_select_jccr_mode(state, + x & ~7, y & ~7, + depth, + NULL, + lcu, + NULL); + } int cbf = cbf_is_set_any(cur_cu->cbf, depth); From 3532f7a93839bc54f0134d3d46d2de45e5e4838d Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 31 Mar 2022 15:19:32 +0300 Subject: [PATCH 109/135] Correct counting of jccr flag for inter pu rdo --- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index 12f75832..1ab13841 100644 --- a/src/search.c +++ b/src/search.c @@ -638,7 +638,7 @@ void kvz_select_jccr_mode( CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search"); int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1; - if(cbf_mask != -1) + if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2) CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag"); if(pred_cu->joint_cb_cr) { From 863929378fba09518bc03d128411c67e6dd9a68b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 1 Apr 2022 15:15:29 +0300 Subject: [PATCH 110/135] Better get_mvd_coding_cost --- src/search_inter.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/src/search_inter.c b/src/search_inter.c index 5135c517..4a1a13ef 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -331,37 +331,16 @@ static double get_mvd_coding_cost(const encoder_state_t* state, const int32_t mvd_hor, const int32_t mvd_ver) { - double 
bitcost = 0; + double bitcost = 4 << CTX_FRAC_BITS; + const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; + bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += abs_mvd.y == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_hor != 0)); - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_ver != 0)); - - if (hor_abs_gr0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_hor_abs > 1)); - } - if (ver_abs_gr0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_ver_abs > 1)); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs - 2); - } - bitcost += 1; - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs - 2) ; - } - bitcost += 1; - } - - return bitcost; + // Round and shift back to integer bits. 
+ return bitcost / (1 << CTX_FRAC_BITS); } From debe5924a9922cdf70dac7168636d3ce6c20db80 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 15 Apr 2022 11:38:31 +0300 Subject: [PATCH 111/135] Fix --- src/encode_coding_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index f31c950d..e37b9f16 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -900,7 +900,7 @@ void kvz_encode_intra_luma_coding_unit(const encoder_state_t * const state, CABAC_BIN_EP(cabac, mip_transpose, "mip_transposed"); if (cabac->only_count) bits += 1; kvz_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes, bits_out); - if (cabac->only_count) *bits_out += bits; + if (cabac->only_count && bits_out) *bits_out += bits; return; } } From 52240bde6eb93252a33fc52380fee727c2dc642e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 6 Apr 2022 14:52:32 +0300 Subject: [PATCH 112/135] ignore chroma during regular intra search --- src/search_intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index d896c091..558ddcd6 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -271,7 +271,7 @@ static double search_intra_trdepth(encoder_state_t * const state, const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; + const bool reconstruct_chroma = false; // (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; struct { kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; From e74ea38673637a5d0a58242b87dd0a80700a2ff6 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 8 Apr 2022 08:53:11 +0300 Subject: [PATCH 113/135] WIP: initial structs etc. 
--- src/cabac.h | 2 +- src/intra.c | 18 +++++++++------- src/intra.h | 17 ++++++++++++---- src/search.c | 2 +- src/search.h | 4 ++-- src/search_intra.c | 51 ++++++++++++++++++++++++++++++---------------- 6 files changed, 62 insertions(+), 32 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index eff15220..65de6f92 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -159,7 +159,7 @@ extern const float kvz_f_entropy_bits[512]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ - if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ + if((cabac)->only_count || 1) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ if((cabac)->update) {\ (cabac)->cur_ctx = ctx;\ CABAC_BIN((cabac), (val), (name));\ diff --git a/src/intra.c b/src/intra.c index e3606eb9..c591ac62 100644 --- a/src/intra.c +++ b/src/intra.c @@ -730,10 +730,14 @@ void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* /** \brief Matrix weighted intra prediction. 
*/ -void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* const refs, - const uint16_t pred_block_width, const uint16_t pred_block_height, - kvz_pixel* dst, - const int mip_mode, const bool mip_transp) +void kvz_mip_predict( + const encoder_state_t* const state, + kvz_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + kvz_pixel* dst, + const int mip_mode, + const bool mip_transp) { // MIP prediction uses int values instead of kvz_pixel as some temp values may be negative @@ -877,7 +881,7 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c void kvz_intra_predict( - encoder_state_t *const state, + const encoder_state_t* const state, kvz_intra_references *refs, int_fast8_t log2_width, int_fast8_t mode, @@ -1352,7 +1356,7 @@ void kvz_intra_build_reference( } static void intra_recon_tb_leaf( - encoder_state_t *const state, + const encoder_state_t* const state, int x, int y, int depth, @@ -1482,7 +1486,7 @@ static void intra_recon_tb_leaf( * \param lcu containing LCU */ void kvz_intra_recon_cu( - encoder_state_t *const state, + const encoder_state_t* const state, int x, int y, int depth, diff --git a/src/intra.h b/src/intra.h index e35f57e5..7228bb15 100644 --- a/src/intra.h +++ b/src/intra.h @@ -74,6 +74,15 @@ typedef struct { int8_t jccr; } intra_parameters_t; +typedef struct { + cu_info_t pred_cu; + cclm_parameters_t cclm_parameters[2]; + double cost; + double bits; + double coeff_bits; + double distortion; +} intra_search_data_t ; + /** * \brief Function for deriving intra luma predictions * \param x x-coordinate of the PU in pixels @@ -125,7 +134,7 @@ void kvz_intra_build_reference( * \param filter_boundary Whether to filter the boundary on modes 10 and 26. 
*/ void kvz_intra_predict( - encoder_state_t *const state, + const encoder_state_t* const state, kvz_intra_references *refs, int_fast8_t log2_width, int_fast8_t mode, @@ -135,7 +144,7 @@ void kvz_intra_predict( const uint8_t multi_ref_idx); void kvz_intra_recon_cu( - encoder_state_t *const state, + const encoder_state_t* const state, int x, int y, int depth, @@ -162,11 +171,11 @@ void kvz_predict_cclm( int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); void kvz_mip_predict( - encoder_state_t const * const state, + const encoder_state_t* const state, kvz_intra_references * refs, const uint16_t width, const uint16_t height, kvz_pixel* dst, const int mip_mode, const bool mip_transp -); \ No newline at end of file + ); \ No newline at end of file diff --git a/src/search.c b/src/search.c index 1ab13841..ca9fdd03 100644 --- a/src/search.c +++ b/src/search.c @@ -1015,7 +1015,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_intra_recon_cu(state, x & ~7, y & ~7, // TODO: as does this depth, - &intra_parameters, + &intra_parameters, NULL, lcu); if(depth != 0 && state->encoder_control->cfg.jccr) { diff --git a/src/search.h b/src/search.h index ba4ca57c..919b6cab 100644 --- a/src/search.h +++ b/src/search.h @@ -48,11 +48,11 @@ // Modify weight of luma SSD. #ifndef KVZ_LUMA_MULT -# define KVZ_LUMA_MULT 0.8 +#define KVZ_LUMA_MULT 0.8 #endif // Modify weight of chroma SSD. 
#ifndef KVZ_CHROMA_MULT -# define KVZ_CHROMA_MULT 1.5 +#define KVZ_CHROMA_MULT 1.5 #endif /** diff --git a/src/search_intra.c b/src/search_intra.c index 558ddcd6..9aa0023c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -171,6 +171,23 @@ static void get_cost_dual(encoder_state_t * const state, } +void INLINE rough_cost_prediction_mode(const encoder_state_t * const state, + const int x_px, + const int y_px, + const int depth, + intra_search_data_t * data, + lcu_t* lcu) +{ + kvz_intra_recon_cu( + state, + x_px, y_px, + depth, + &data, + &data->pred_cu, + lcu); +} + + /** * \brief Derives mts_last_scan_pos and violates_mts_coeff_constraint for pred_cu. * @@ -271,7 +288,7 @@ static double search_intra_trdepth(encoder_state_t * const state, const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - const bool reconstruct_chroma = false; // (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; + const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; struct { kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; @@ -344,11 +361,11 @@ static double search_intra_trdepth(encoder_state_t * const state, } kvz_intra_recon_cu(state, - x_px, y_px, - depth, - &intra_parameters, - pred_cu, - lcu); + x_px, y_px, + depth, + &intra_parameters, + pred_cu, + lcu); // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1 if (pred_cu->tr_idx > 1) @@ -376,11 +393,11 @@ static double search_intra_trdepth(encoder_state_t * const state, intra_parameters.chroma_mode = chroma_mode; intra_parameters.jccr = -1; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently kvz_intra_recon_cu(state, - x_px & ~7, y_px & ~7, - depth, - &intra_parameters, - pred_cu, - lcu); + x_px & ~7, y_px & ~7, + depth, + 
&intra_parameters, + pred_cu, + lcu); best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } pred_cu->tr_skip = best_tr_idx == MTS_SKIP; @@ -1000,11 +1017,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, intra_parameters.chroma_mode = modes[chroma_mode_i]; if(chroma.mode < 67 || depth == 0) { kvz_intra_recon_cu(state, - x_px, y_px, - depth, - &intra_parameters, - NULL, - lcu); + x_px, y_px, + depth, + &intra_parameters, + NULL, + lcu); } else { @@ -1038,7 +1055,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, state, x_px, y_px, depth, - &intra_parameters, + &intra_parameters, NULL, lcu); } From 00acf661cf55d0045be9090b9c559067c057038c Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 8 Apr 2022 13:41:42 +0300 Subject: [PATCH 114/135] WIP: intra prediction generation from a singular function --- src/cu.h | 10 ++++ src/intra.c | 119 ++++++++++++++++++++++++++++++--------------- src/intra.h | 23 +++------ src/search_intra.c | 55 ++++++++++----------- 4 files changed, 120 insertions(+), 87 deletions(-) diff --git a/src/cu.h b/src/cu.h index 4be18926..f7a9e1df 100644 --- a/src/cu.h +++ b/src/cu.h @@ -183,6 +183,16 @@ typedef struct }; } cu_info_t; +typedef struct { + int16_t x; + int16_t y; + int8_t width; + int8_t height; + int8_t chroma_width; + int8_t chroma_height; +} cu_loc_t; + + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? 
(cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) diff --git a/src/intra.c b/src/intra.c index c591ac62..edf3fe06 100644 --- a/src/intra.c +++ b/src/intra.c @@ -82,6 +82,17 @@ static const uint8_t num_ref_pixels_left[16][16] = { { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 } }; + +void static mip_predict( + const encoder_state_t* const state, + kvz_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + kvz_pixel* dst, + const int mip_mode, + const bool mip_transp); + + int8_t kvz_intra_get_dir_luma_predictor( const uint32_t x, const uint32_t y, @@ -728,9 +739,10 @@ void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* } + /** \brief Matrix weighted intra prediction. */ -void kvz_mip_predict( +void static mip_predict( const encoder_state_t* const state, kvz_intra_references* const refs, const uint16_t pred_block_width, @@ -880,14 +892,13 @@ void kvz_mip_predict( } -void kvz_intra_predict( +void static intra_predict_regular( const encoder_state_t* const state, kvz_intra_references *refs, int_fast8_t log2_width, int_fast8_t mode, color_t color, kvz_pixel *dst, - bool filter_boundary, const uint8_t multi_ref_idx) { const int_fast8_t width = 1 << log2_width; @@ -1355,6 +1366,58 @@ void kvz_intra_build_reference( } } + +void kvz_intra_predict( + const encoder_state_t* const state, + kvz_intra_references* const refs, + const cu_loc_t* const cu_loc, + const color_t color, + kvz_pixel* dst, + intra_search_data_t* data, + lcu_t* lcu +) +{ + const kvz_config* cfg = &state->encoder_control->cfg; + const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); + // TODO: what is this used for? + // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); + bool use_mip = false; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int x = cu_loc->x; + const int y = cu_loc->y; + int8_t intra_mode = color == COLOR_Y ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma; + if (data->pred_cu.intra.mip_flag) { + if (color == COLOR_Y) { + use_mip = true; + } + else { + use_mip = state->encoder_control->chroma_format == KVZ_CSP_444; + } + } + if (intra_mode < 68) { + if (use_mip) { + assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); + mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); + } + else { + intra_predict_regular(state, refs, kvz_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + } + } + else { + kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); + if (data->pred_cu.depth != data->pred_cu.tr_depth) { + cclm_parameters_t temp_params; + kvz_predict_cclm( + state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, &temp_params); + } + else { + linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); + } + } +} + + static void intra_recon_tb_leaf( const encoder_state_t* const state, int x, @@ -1407,44 +1470,20 @@ static void intra_recon_tb_leaf( kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); kvz_pixel pred[32 * 32]; - const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); - bool use_mip = false; - int8_t intra_mode = color == COLOR_Y ? 
intra_paramas->luma_mode : intra_paramas->chroma_mode; - if (intra_paramas->mip_flag) { - if (color == COLOR_Y) { - use_mip = true; - } else { - // MIP can be used for chroma if the chroma scheme is 444 - if (state->encoder_control->chroma_format == KVZ_CSP_444) { - use_mip = true; - } else { - // If MIP cannot be used for chroma, set mode to planar - intra_mode = 0; - } - } - } - if(intra_mode < 68) { - if (use_mip) { - assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); - kvz_mip_predict(state, &refs, width, height, pred, intra_mode, intra_paramas->mip_transp); - } - else { - kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index); - } - } - else { - kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width); - if (LCU_GET_CU_AT_PX(lcu, x_scu, y_scu)->depth != depth) { - cclm_parameters_t temp_params; - kvz_predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params); - } - else { - linear_transform_cclm(&intra_paramas->cclm_parameters[color == COLOR_U ? 
0 : 1], pred, pred, width, width); - } - } + cu_loc_t loc = { + x, y, + width, width, + width, width, + }; + intra_search_data_t search_data; + search_data.pred_cu.intra.mip_flag = intra_paramas->mip_flag; + search_data.pred_cu.intra.multi_ref_idx = intra_paramas->multi_ref_idx; + search_data.pred_cu.intra.mode = intra_paramas->luma_mode; + search_data.pred_cu.intra.mode_chroma = intra_paramas->chroma_mode; + search_data.pred_cu.tr_depth = depth; + search_data.pred_cu.depth = depth; + kvz_intra_predict(state, &refs, &loc, color, pred, &search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; kvz_pixel *block = NULL; diff --git a/src/intra.h b/src/intra.h index 7228bb15..a0d21a68 100644 --- a/src/intra.h +++ b/src/intra.h @@ -135,13 +135,12 @@ void kvz_intra_build_reference( */ void kvz_intra_predict( const encoder_state_t* const state, - kvz_intra_references *refs, - int_fast8_t log2_width, - int_fast8_t mode, - color_t color, - kvz_pixel *dst, - bool filter_boundary, - const uint8_t multi_ref_idx); + kvz_intra_references* const refs, + const cu_loc_t* const cu_loc, + const color_t color, + kvz_pixel* dst, + intra_search_data_t* data, + lcu_t* lcu); void kvz_intra_recon_cu( const encoder_state_t* const state, @@ -169,13 +168,3 @@ void kvz_predict_cclm( ); int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); - -void kvz_mip_predict( - const encoder_state_t* const state, - kvz_intra_references * refs, - const uint16_t width, - const uint16_t height, - kvz_pixel* dst, - const int mip_mode, - const bool mip_transp - ); \ No newline at end of file diff --git a/src/search_intra.c b/src/search_intra.c index 9aa0023c..dcb2ada8 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -171,20 +171,18 @@ static void get_cost_dual(encoder_state_t * const state, } -void INLINE rough_cost_prediction_mode(const encoder_state_t * const state, - const int x_px, - const int y_px, +void INLINE 
rough_cost_prediction_mode(const encoder_state_t* const state, + kvz_intra_references* const references, + const cu_loc_t* const cu_loc, const int depth, + const color_t color, intra_search_data_t * data, lcu_t* lcu) { - kvz_intra_recon_cu( - state, - x_px, y_px, - depth, - &data, - &data->pred_cu, - lcu); + const int width = MAX(4, (color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C) >> depth); + const int height= MAX(4, (color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C) >> depth); + kvz_pixel pred[TR_MAX_WIDTH * TR_MAX_WIDTH + SIMD_ALIGNMENT]; + // kvz_intra_predict(state, references, width, height, pred, data); } @@ -497,7 +495,6 @@ static void search_intra_chroma_rough(encoder_state_t * const state, assert(!(x_px & 4 || y_px & 4)); const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2); for (int i = 0; i < 8; ++i) { costs[i] = 0; @@ -505,6 +502,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state, cost_pixel_nxn_func *const satd_func = kvz_pixels_get_satd_func(width); //cost_pixel_nxn_func *const sad_func = kvz_pixels_get_sad_func(width); + cu_loc_t loc = { x_px, y_px, width, width, width, width }; cclm_parameters_t cclm_params; @@ -515,32 +513,22 @@ static void search_intra_chroma_rough(encoder_state_t * const state, kvz_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width); - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < (state->encoder_control->cfg.cclm ? 
8 : 5); ++i) { if (modes[i] == -1) continue; - kvz_intra_predict(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false, 0); + kvz_intra_predict(state, refs_u, &loc, COLOR_U, pred, NULL, lcu); + // kvz_intra_predict_regular(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false, 0); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); costs[i] += satd_func(pred, orig_block); } - for (int i = 5; i < 8; i++) { - assert(state->encoder_control->cfg.cclm); - kvz_predict_cclm( - state, - COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params); - } kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width); - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < (state->encoder_control->cfg.cclm ? 8 : 5); ++i) { if (modes[i] == -1) continue; - kvz_intra_predict(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false, 0); + kvz_intra_predict(state, refs_u, &loc, COLOR_V, pred, NULL, lcu); + //kvz_intra_predict_regular(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false, 0); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); costs[i] += satd_func(pred, orig_block); } - for (int i = 5; i < 8; i++) { - assert(state->encoder_control->cfg.cclm); - kvz_predict_cclm( - state, - COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params); - } kvz_sort_modes(modes, costs, 5); } @@ -620,12 +608,17 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. 
+ cu_loc_t loc = { 0, 0, width, width, width, width }; + intra_search_data_t search_proxy; + FILL(search_proxy, 0); + for (int mode = 2; mode <= 66; mode += PARALLEL_BLKS * offset) { double costs_out[PARALLEL_BLKS] = { 0 }; for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { - kvz_intra_predict(state, refs, log2_width, mode + i * offset, COLOR_Y, preds[i], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = mode + i*offset; + kvz_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL); } } @@ -664,7 +657,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, if (mode_in_range) { for (int i = 0; i < PARALLEL_BLKS; ++i) { if (test_modes[i] >= 2 && test_modes[i] <= 66) { - kvz_intra_predict(state, refs, log2_width, test_modes[i], COLOR_Y, preds[i], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = test_modes[i]; + kvz_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL); } } @@ -701,7 +695,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, } if (!has_mode) { - kvz_intra_predict(state, refs, log2_width, mode, COLOR_Y, preds[0], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = mode; + kvz_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL); costs[modes_selected] = get_cost(state, preds[0], orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; From f7563076cb2969d9521db1d6a42b0759e9bfcf5b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 11 Apr 2022 09:58:37 +0300 Subject: [PATCH 115/135] WIP --- src/cu.h | 2 +- src/intra.c | 50 +++---- src/intra.h | 17 +-- src/search.c | 45 +++--- src/search_intra.c | 339 ++++++++++++++++++++------------------------- src/search_intra.h | 5 +- 6 files changed, 191 insertions(+), 267 deletions(-) diff --git a/src/cu.h b/src/cu.h index f7a9e1df..2b342e95 100644 --- a/src/cu.h +++ b/src/cu.h @@ -148,7 +148,7 @@ typedef struct uint8_t merge_idx : 3; //!< \brief merge index 
uint8_t tr_skip : 1; //!< \brief transform skip flag uint8_t tr_idx : 3; //!< \brief transform index - uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding + uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding uint16_t cbf; diff --git a/src/intra.c b/src/intra.c index edf3fe06..12314e14 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1369,13 +1369,13 @@ void kvz_intra_build_reference( void kvz_intra_predict( const encoder_state_t* const state, - kvz_intra_references* const refs, + const kvz_intra_references* const refs, const cu_loc_t* const cu_loc, const color_t color, kvz_pixel* dst, intra_search_data_t* data, - lcu_t* lcu -) + const lcu_t* lcu + ) { const kvz_config* cfg = &state->encoder_control->cfg; const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); @@ -1406,10 +1406,9 @@ void kvz_intra_predict( } else { kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); - if (data->pred_cu.depth != data->pred_cu.tr_depth) { - cclm_parameters_t temp_params; + if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { kvz_predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, &temp_params); + state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, &data->cclm_parameters[color == COLOR_U ? 0 : 1]); } else { linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); @@ -1425,7 +1424,7 @@ static void intra_recon_tb_leaf( int depth, lcu_t *lcu, color_t color, - const intra_parameters_t* intra_paramas) + const intra_search_data_t* search_data) { const kvz_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 
0 : 1; @@ -1447,7 +1446,7 @@ static void intra_recon_tb_leaf( int x_scu = SUB_SCU(x); int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; - uint8_t multi_ref_index = color == COLOR_Y ? intra_paramas->multi_ref_idx : 0; + uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0; kvz_intra_references refs; // Extra reference lines for use with MRL. Extra lines needed only for left edge. @@ -1476,14 +1475,8 @@ static void intra_recon_tb_leaf( width, width, width, width, }; - intra_search_data_t search_data; - search_data.pred_cu.intra.mip_flag = intra_paramas->mip_flag; - search_data.pred_cu.intra.multi_ref_idx = intra_paramas->multi_ref_idx; - search_data.pred_cu.intra.mode = intra_paramas->luma_mode; - search_data.pred_cu.intra.mode_chroma = intra_paramas->chroma_mode; - search_data.pred_cu.tr_depth = depth; - search_data.pred_cu.depth = depth; - kvz_intra_predict(state, &refs, &loc, color, pred, &search_data, lcu); + + kvz_intra_predict(state, &refs, &loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; kvz_pixel *block = NULL; @@ -1529,7 +1522,7 @@ void kvz_intra_recon_cu( int x, int y, int depth, - const intra_parameters_t* intra_parameters, + intra_search_data_t* search_data, cu_info_t *cur_cu, lcu_t *lcu) { @@ -1538,12 +1531,11 @@ void kvz_intra_recon_cu( if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - bool use_mip = intra_parameters->mip_flag; - const int8_t mode_luma = intra_parameters->luma_mode; - const int8_t mode_chroma= intra_parameters->chroma_mode; + const int8_t mode_luma = search_data->pred_cu.intra.mode; + const int8_t mode_chroma= search_data->pred_cu.intra.mode_chroma; if (mode_luma != -1 && mode_chroma != -1) { - if (use_mip) { + if (search_data->pred_cu.intra.mip_flag) { assert(mode_luma == mode_chroma && "Chroma mode must be derived from luma mode if block uses MIP."); } } @@ -1564,10 +1556,10 @@ void 
kvz_intra_recon_cu( const int32_t x2 = x + offset; const int32_t y2 = y + offset; - kvz_intra_recon_cu(state, x, y, depth + 1, intra_parameters, NULL, lcu); - kvz_intra_recon_cu(state, x2, y, depth + 1, intra_parameters, NULL, lcu); - kvz_intra_recon_cu(state, x, y2, depth + 1, intra_parameters, NULL, lcu); - kvz_intra_recon_cu(state, x2, y2, depth + 1, intra_parameters, NULL, lcu); + kvz_intra_recon_cu(state, x, y, depth + 1, search_data, NULL, lcu); + kvz_intra_recon_cu(state, x2, y, depth + 1, search_data, NULL, lcu); + kvz_intra_recon_cu(state, x, y2, depth + 1, search_data, NULL, lcu); + kvz_intra_recon_cu(state, x2, y2, depth + 1, search_data, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { @@ -1589,13 +1581,13 @@ void kvz_intra_recon_cu( // Process a leaf TU. if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, intra_parameters); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data); } if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, intra_parameters); - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, intra_parameters); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, intra_parameters->jccr != -1 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0), x, y, depth, cur_cu, lcu, false); + kvz_quantize_lcu_residual(state, has_luma, has_chroma, search_data->pred_cu.joint_cb_cr != 4, x, y, depth, cur_cu, lcu, false); } } diff --git a/src/intra.h b/src/intra.h index a0d21a68..2982bff3 100644 --- a/src/intra.h +++ b/src/intra.h @@ -63,17 +63,6 @@ typedef struct int16_t b; } cclm_parameters_t; -typedef struct { - int8_t luma_mode; - int8_t chroma_mode; - cclm_parameters_t cclm_parameters[2]; - uint8_t multi_ref_idx; - bool mip_flag; - bool mip_transp; - int8_t mts_idx; - int8_t jccr; -} 
intra_parameters_t; - typedef struct { cu_info_t pred_cu; cclm_parameters_t cclm_parameters[2]; @@ -135,19 +124,19 @@ void kvz_intra_build_reference( */ void kvz_intra_predict( const encoder_state_t* const state, - kvz_intra_references* const refs, + const kvz_intra_references* const refs, const cu_loc_t* const cu_loc, const color_t color, kvz_pixel* dst, intra_search_data_t* data, - lcu_t* lcu); + const lcu_t* lcu); void kvz_intra_recon_cu( const encoder_state_t* const state, int x, int y, int depth, - const intra_parameters_t * intra_parameters, + intra_search_data_t* search_data, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search.c b/src/search.c index ca9fdd03..01dc9447 100644 --- a/src/search.c +++ b/src/search.c @@ -950,12 +950,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I); - intra_parameters_t intra_parameters; + intra_search_data_t intra_search; if (can_use_intra && !skip_intra) { - double intra_cost; - intra_parameters.jccr = -1; - kvz_search_cu_intra(state, x, y, depth, lcu, - &intra_cost, &intra_parameters); + intra_search.pred_cu = *cur_cu; + intra_search.pred_cu.joint_cb_cr = 4; + kvz_search_cu_intra(state, x, y, depth, &intra_search, + lcu); #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default @@ -966,17 +966,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, intra_cost += pred_mode_type_bits * state->lambda; } #endif - if (intra_cost < cost) { - cost = intra_cost; + if (intra_search.cost < cost) { + cost = intra_search.cost; + *cur_cu = intra_search.pred_cu; cur_cu->type = CU_INTRA; - cur_cu->part_size = depth > MAX_DEPTH ? 
SIZE_NxN : SIZE_2Nx2N; - cur_cu->intra.mode = intra_parameters.luma_mode; - cur_cu->intra.multi_ref_idx = intra_parameters.multi_ref_idx; - cur_cu->intra.mip_flag = intra_parameters.mip_flag; - cur_cu->intra.mip_is_transposed = intra_parameters.mip_transp; - - //If the CU is not split from 64x64 block, the MTS is disabled for that CU. - cur_cu->tr_idx = (depth > 0) ? intra_parameters.mts_idx : 0; } } @@ -984,14 +977,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // mode search of adjacent CUs. if (cur_cu->type == CU_INTRA) { assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); - cur_cu->intra.mode_chroma = cur_cu->intra.mode; - + + intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - intra_parameters.chroma_mode = -1; kvz_intra_recon_cu(state, x, y, - depth, - &intra_parameters, + depth, &intra_search, NULL, lcu); @@ -1006,16 +997,15 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // into account, so there is less of a chanse of luma mode being // really bad for chroma. 
if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { - cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, intra_parameters.cclm_parameters); + cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, intra_search.cclm_parameters); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - intra_parameters.chroma_mode = cur_cu->intra.mode_chroma; - intra_parameters.luma_mode = -1; // skip luma - intra_parameters.jccr = 0; + intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode = -1; // skip luma + intra_search.pred_cu.joint_cb_cr = 0; kvz_intra_recon_cu(state, x & ~7, y & ~7, // TODO: as does this - depth, - &intra_parameters, + depth, &intra_search, NULL, lcu); if(depth != 0 && state->encoder_control->cfg.jccr) { @@ -1223,8 +1213,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, }; kvz_intra_recon_cu(state, x, y, - depth, - &intra_parameters, + depth, , NULL, lcu); diff --git a/src/search_intra.c b/src/search_intra.c index dcb2ada8..b4e94900 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -269,13 +269,15 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, * \param cost_treshold RD cost at which search can be stopped. * \param mts_mode Selected MTS mode for current intra mode. 
*/ -static double search_intra_trdepth(encoder_state_t * const state, - int x_px, int y_px, int depth, int max_depth, - int intra_mode, int cost_treshold, - cu_info_t *const pred_cu, - lcu_t *const lcu, - cclm_parameters_t *cclm_params, - const int mts_mode) +static double search_intra_trdepth( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + int max_depth, + int cost_treshold, + intra_search_data_t *const search_data, + lcu_t *const lcu) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -284,9 +286,9 @@ static double search_intra_trdepth(encoder_state_t * const state, const int offset = width / 2; const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; + cu_info_t* pred_cu = &search_data->pred_cu; struct { kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; @@ -300,7 +302,6 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth > 0) { const bool mts_enabled = state->encoder_control->cfg.mts == KVZ_MTS_INTRA || state->encoder_control->cfg.mts == KVZ_MTS_BOTH; - tr_cu->tr_depth = depth; pred_cu->tr_depth = depth; nosplit_cost = 0.0; @@ -311,16 +312,16 @@ static double search_intra_trdepth(encoder_state_t * const state, cbf_clear(&pred_cu->cbf, depth, COLOR_V); } - const int8_t chroma_mode = reconstruct_chroma ? intra_mode : -1; + const int8_t chroma_mode = reconstruct_chroma ? 
pred_cu->intra.mode : -1; double best_rd_cost = MAX_INT; int best_tr_idx = 0; int trafo; int num_transforms = 1; - if (mts_mode != -1) + if (pred_cu->tr_idx != MTS_TR_NUM) { - trafo = mts_mode; - num_transforms = mts_mode + 1; + trafo = pred_cu->tr_idx; + num_transforms = pred_cu->tr_idx + 1; } else { @@ -332,20 +333,8 @@ static double search_intra_trdepth(encoder_state_t * const state, num_transforms = MAX(num_transforms, 2); } - intra_parameters_t intra_parameters = { - .luma_mode = intra_mode, - -1, - {{0, 0, 0}, {0, 0 ,0}}, - pred_cu->intra.multi_ref_idx, - pred_cu->intra.mip_flag, - pred_cu->intra.mip_is_transposed, - 0, - -1, - }; - for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; - intra_parameters.mts_idx = trafo; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; @@ -360,8 +349,7 @@ static double search_intra_trdepth(encoder_state_t * const state, kvz_intra_recon_cu(state, x_px, y_px, - depth, - &intra_parameters, + depth, search_data, pred_cu, lcu); @@ -371,7 +359,7 @@ static double search_intra_trdepth(encoder_state_t * const state, derive_mts_constraints(pred_cu, lcu, depth, lcu_px); if (pred_cu->violates_mts_coeff_constraint || !pred_cu->mts_last_scan_pos) { - assert(mts_mode == -1); //mts mode should not be decided and then not allowed to be used. (might be some exception here) + assert(pred_cu->tr_idx == MTS_TR_NUM); //mts mode should not be decided and then not allowed to be used. 
(might be some exception here) continue; } } @@ -387,13 +375,12 @@ static double search_intra_trdepth(encoder_state_t * const state, } } if(reconstruct_chroma) { - intra_parameters.luma_mode = -1; - intra_parameters.chroma_mode = chroma_mode; - intra_parameters.jccr = -1; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently + pred_cu->intra.mode = -1; + pred_cu->intra.mode_chroma = chroma_mode; + pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently kvz_intra_recon_cu(state, x_px & ~7, y_px & ~7, - depth, - &intra_parameters, + depth, search_data, pred_cu, lcu); best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); @@ -426,15 +413,15 @@ static double search_intra_trdepth(encoder_state_t * const state, if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += 
search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu); } double cbf_bits = 0.0; @@ -483,29 +470,44 @@ static double search_intra_trdepth(encoder_state_t * const state, return nosplit_cost; } } +void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. + for (uint8_t i = 1; i < length; ++i) { + const intra_search_data_t cur_cost = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost.cost < modes[j - 1].cost) { + modes[j] = modes[j - 1]; + --j; + } + modes[j] = cur_cost; + } +} - -static void search_intra_chroma_rough(encoder_state_t * const state, - int x_px, int y_px, int depth, - const kvz_pixel *orig_u, const kvz_pixel *orig_v, int16_t origstride, - kvz_intra_references *refs_u, kvz_intra_references *refs_v, - int8_t luma_mode, - int8_t modes[8], double costs[8], lcu_t* lcu) +static void search_intra_chroma_rough( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + const kvz_pixel *orig_u, + const kvz_pixel *orig_v, + int16_t origstride, + kvz_intra_references *refs_u, + kvz_intra_references *refs_v, + intra_search_data_t* chroma_data, + lcu_t* lcu) { assert(!(x_px & 4 || y_px & 4)); const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); - for (int i = 0; i < 8; ++i) { - costs[i] = 0; - } - cost_pixel_nxn_func *const satd_func = kvz_pixels_get_satd_func(width); //cost_pixel_nxn_func *const sad_func = kvz_pixels_get_sad_func(width); cu_loc_t loc = { x_px, y_px, width, width, width, width }; - - cclm_parameters_t cclm_params; - + kvz_pixel _pred[32 * 32 + SIMD_ALIGNMENT]; kvz_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); @@ -513,24 +515,27 @@ static void search_intra_chroma_rough(encoder_state_t * const state, kvz_pixel *orig_block = 
ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width); - for (int i = 0; i < (state->encoder_control->cfg.cclm ? 8 : 5); ++i) { - if (modes[i] == -1) continue; - kvz_intra_predict(state, refs_u, &loc, COLOR_U, pred, NULL, lcu); - // kvz_intra_predict_regular(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false, 0); + int modes_count = (state->encoder_control->cfg.cclm ? 8 : 5); + for (int i = 0; i < modes_count; ++i) { + if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue; + kvz_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); - costs[i] += satd_func(pred, orig_block); + chroma_data[i].cost += satd_func(pred, orig_block); } kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width); - for (int i = 0; i < (state->encoder_control->cfg.cclm ? 8 : 5); ++i) { - if (modes[i] == -1) continue; - kvz_intra_predict(state, refs_u, &loc, COLOR_V, pred, NULL, lcu); - //kvz_intra_predict_regular(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false, 0); + for (int i = 0; i < modes_count; ++i) { + if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue; + kvz_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); - costs[i] += satd_func(pred, orig_block); + chroma_data[i].cost += satd_func(pred, orig_block); } - kvz_sort_modes(modes, costs, 5); + for (int i = 0; i < modes_count; ++i) { + const double bits = kvz_chroma_mode_bits(state, chroma_data[i].pred_cu.intra.mode_chroma, chroma_data[i].pred_cu.intra.mode); + chroma_data[i].bits = bits; + chroma_data[i].cost = bits * state->lambda_sqrt; + } } @@ -847,7 +852,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // Reset transform split data in lcu.cu for this area. 
kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1); + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, , lcu); *mode_cost_p += mode_cost; *mode_trafo_p = pred_cu.tr_idx; @@ -874,34 +879,34 @@ static int8_t search_intra_rdo(encoder_state_t * const state, // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. - if (tr_depth != depth) { - cu_info_t pred_cu; - pred_cu.depth = depth; - pred_cu.type = CU_INTRA; - pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); - pred_cu.skipped = 0; - pred_cu.merged = 0; - pred_cu.bdpcmMode = 0; - if (use_mip) { - int transp_off = num_mip_modes_full >> 1; - bool is_transposed = (mip_modes[0] >= transp_off ? true : false); - int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]); - pred_cu.intra.mode = pred_mode; - pred_cu.intra.mode_chroma = pred_mode; - pred_cu.intra.multi_ref_idx = 0; - pred_cu.intra.mip_flag = true; - pred_cu.intra.mip_is_transposed = is_transposed; - } - else { - pred_cu.intra.mode = modes[0]; - pred_cu.intra.mode_chroma = modes[0]; - pred_cu.intra.multi_ref_idx = multi_ref_idx; - pred_cu.intra.mip_flag = false; - pred_cu.intra.mip_is_transposed = false; - } - FILL(pred_cu.cbf, 0); - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]); - } + //if (tr_depth != depth) { + // cu_info_t pred_cu; + // pred_cu.depth = depth; + // pred_cu.type = CU_INTRA; + // pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); + // pred_cu.skipped = 0; + // pred_cu.merged = 0; + // pred_cu.bdpcmMode = 0; + // if (use_mip) { + // int transp_off = num_mip_modes_full >> 1; + // bool is_transposed = (mip_modes[0] >= transp_off ? 
true : false); + // int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]); + // pred_cu.intra.mode = pred_mode; + // pred_cu.intra.mode_chroma = pred_mode; + // pred_cu.intra.multi_ref_idx = 0; + // pred_cu.intra.mip_flag = true; + // pred_cu.intra.mip_is_transposed = is_transposed; + // } + // else { + // pred_cu.intra.mode = modes[0]; + // pred_cu.intra.mode_chroma = modes[0]; + // pred_cu.intra.multi_ref_idx = multi_ref_idx; + // pred_cu.intra.mip_flag = false; + // pred_cu.intra.mip_is_transposed = false; + // } + // FILL(pred_cu.cbf, 0); + // search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]); + //} // TODO: modes to check does not consider mip modes. Maybe replace with array when mip search is optimized? return modes_to_check; @@ -958,11 +963,14 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in } -int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, - int x_px, int y_px, int depth, - int8_t intra_mode, - int8_t modes[8], int8_t num_modes, - lcu_t *const lcu, cclm_parameters_t *best_cclm) +int8_t kvz_search_intra_chroma_rdo( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + int8_t num_modes, + lcu_t *const lcu, + intra_search_data_t* chroma_data) { const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); @@ -981,101 +989,33 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); - - cclm_parameters_t cclm_params[2] = { 0 }; - + const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - - struct { - double cost; - 
int8_t mode; - cclm_parameters_t cclm[2]; - int8_t jccr; - } chroma, best_chroma; - - // chroma.cclm = cclm_params; - - best_chroma.mode = 0; - best_chroma.cost = MAX_INT; - best_chroma.jccr = 0; - - intra_parameters_t intra_parameters; - memset(&intra_parameters, 0, sizeof(intra_parameters_t)); - intra_parameters.luma_mode = -1; // skip luma - - chroma.jccr = 0; - - for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) { - chroma.mode = modes[chroma_mode_i]; - if (chroma.mode == -1) continue; - intra_parameters.chroma_mode = modes[chroma_mode_i]; - if(chroma.mode < 67 || depth == 0) { + + for (int8_t i = 0; i < num_modes; ++i) { + const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma; + if(mode < 67 || depth == 0) { kvz_intra_recon_cu(state, x_px, y_px, - depth, - &intra_parameters, + depth, &chroma_data[i], NULL, lcu); } - else { - - kvz_predict_cclm( - state, COLOR_U, - c_width, c_width, - x_px & ~7, y_px & ~7, - state->tile->frame->source->stride, - chroma.mode, - lcu, - &refs[0], NULL, - &cclm_params[0]); - - chroma.cclm[0] = cclm_params[0]; - intra_parameters.cclm_parameters[0] = cclm_params[0]; - - kvz_predict_cclm( - state, COLOR_V, - c_width, c_width, - x_px & ~7, y_px & ~7, - state->tile->frame->source->stride, - chroma.mode, - lcu, - &refs[1], NULL, - &cclm_params[1]); - - chroma.cclm[1] = cclm_params[1]; - intra_parameters.cclm_parameters[1] = cclm_params[1]; - - kvz_intra_recon_cu( - state, - x_px, y_px, - depth, - &intra_parameters, - NULL, - lcu); - } + double bits = 0; if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { - chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + chroma_data[i].cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); } else { - chroma.cost = 0; - kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &chroma.cost); - chroma.jccr = tr_cu->joint_cb_cr; + kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, 
tr_cu, lcu, &chroma_data[i].cost); } - double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); - bits += mode_bits; - chroma.cost += mode_bits * state->lambda; - - if (chroma.cost < best_chroma.cost) { - best_chroma = chroma; - } + double mode_bits = kvz_chroma_mode_bits(state, mode, chroma_data[i].pred_cu.intra.mode); + chroma_data[i].cost += mode_bits * state->lambda; } - best_cclm[0] = best_chroma.cclm[0]; - best_cclm[1] = best_chroma.cclm[1]; - tr_cu->joint_cb_cr = best_chroma.jccr; + sort_modes(chroma_data, num_modes); - return best_chroma.mode; + return chroma_data[0].pred_cu.intra.mode_chroma; } return 100; @@ -1090,12 +1030,19 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); int8_t intra_mode = cur_pu->intra.mode; - - double costs[8]; + int8_t modes[8] = { 0, 50, 18, 1, -1, 81, 82, 83 }; + uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) { modes[4] = intra_mode; } + else { + total_modes -= 1; + modes[4] = modes[5]; + modes[5] = modes[6]; + modes[6] = modes[7]; + } + // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made @@ -1105,13 +1052,20 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, int num_modes = modes_in_depth[depth]; if (state->encoder_control->cfg.rdo >= 3) { - num_modes = state->encoder_control->cfg.cclm ? 8 : 5; + num_modes = total_modes; } + intra_search_data_t chroma_data[8]; + FILL(chroma_data, 0); + for (int i = 0; i < num_modes; i++) { + chroma_data[i].pred_cu = *cur_pu; + chroma_data[i].pred_cu.intra.mode_chroma = modes[i]; + } // Don't do rough mode search if all modes are selected. // FIXME: It might make more sense to only disable rough search if // num_modes is 0.is 0. 
- if (num_modes != 1 && num_modes != 5 && num_modes != 4 && num_modes != 8) { + + if (total_modes != num_modes) { const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; const vector2d_t luma_px = { x_px, y_px }; @@ -1127,14 +1081,16 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, kvz_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; search_intra_chroma_rough(state, x_px, y_px, depth, - ref_u, ref_v, LCU_WIDTH_C, + ref_u, ref_v, + LCU_WIDTH_C, &refs_u, &refs_v, - intra_mode, modes, costs, lcu); + chroma_data, lcu); + sort_modes(chroma_data, total_modes); } int8_t intra_mode_chroma = intra_mode; if (num_modes > 1) { - intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu, best_cclm); + intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data); } return intra_mode_chroma; @@ -1150,9 +1106,8 @@ void kvz_search_cu_intra( const int x_px, const int y_px, const int depth, - lcu_t *lcu, - double *cost_out, - intra_parameters_t* intra_parameters) + intra_search_data_t* search_data, + lcu_t *lcu) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const int8_t cu_width = LCU_WIDTH >> depth; @@ -1323,10 +1278,10 @@ void kvz_search_cu_intra( tmp_best_mode = (tmp_mip_transp ? tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode); } - intra_parameters->luma_mode = tmp_best_mode; - intra_parameters->mts_idx = tmp_best_trafo; + search_data->luma_mode = tmp_best_mode; + search_data->mts_idx = tmp_best_trafo; *cost_out = tmp_best_cost; - intra_parameters->mip_flag = tmp_mip_flag; - intra_parameters->mip_transp = tmp_mip_transp; - intra_parameters->multi_ref_idx = tmp_mip_flag ? 0 : best_line; + search_data->mip_flag = tmp_mip_flag; + search_data->mip_transp = tmp_mip_transp; + search_data->multi_ref_idx = tmp_mip_flag ? 
0 : best_line; } diff --git a/src/search_intra.h b/src/search_intra.h index 8376889f..33df7f2e 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -57,8 +57,7 @@ void kvz_search_cu_intra( const int x_px, const int y_px, const int depth, - lcu_t *lcu, - double *cost_out, - intra_parameters_t* intra_parameters); + intra_search_data_t* search_data, + lcu_t *lcu); #endif // SEARCH_INTRA_H_ From b093248ca7f15ff095745abcfc8e6f96e7391278 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 12 Apr 2022 08:39:30 +0300 Subject: [PATCH 116/135] Maybe working for regular intra search --- src/encode_coding_tree.c | 3 +- src/intra.c | 4 +- src/intra.h | 3 + src/search.c | 15 +- src/search_intra.c | 375 +++++++++++++-------------------------- 5 files changed, 135 insertions(+), 265 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index e37b9f16..e6f39926 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1507,8 +1507,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, // For 4x4 the chroma PU/TU is coded after the last if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); - + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); } } diff --git a/src/intra.c b/src/intra.c index 12314e14..09ced04b 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1588,6 +1588,8 @@ void kvz_intra_recon_cu( intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, search_data->pred_cu.joint_cb_cr != 4, x, y, depth, cur_cu, lcu, false); + kvz_quantize_lcu_residual(state, has_luma, has_chroma, + search_data->pred_cu.joint_cb_cr != 4 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0), + x, y, depth, cur_cu, lcu, false); } } diff --git a/src/intra.h b/src/intra.h index 
2982bff3..59aa10c9 100644 --- a/src/intra.h +++ b/src/intra.h @@ -72,6 +72,9 @@ typedef struct { double distortion; } intra_search_data_t ; + +#define KVZ_NUM_INTRA_MODES 67 + /** * \brief Function for deriving intra luma predictions * \param x x-coordinate of the PU in pixels diff --git a/src/search.c b/src/search.c index 01dc9447..93f78561 100644 --- a/src/search.c +++ b/src/search.c @@ -1008,6 +1008,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, depth, &intra_search, NULL, lcu); + cur_cu->intra.mode_chroma = intra_search.pred_cu.intra.mode_chroma; + cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr; if(depth != 0 && state->encoder_control->cfg.jccr) { kvz_select_jccr_mode(state, x & ~7, y & ~7, @@ -1201,19 +1203,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1; - intra_parameters_t intra_parameters = { - .luma_mode = cur_cu->intra.mode, - .chroma_mode = mode_chroma, - .cclm_parameters ={{0, 0, 0}, {0, 0 ,0}}, - 0, - 0, - 0, - 0, - -1, - }; + kvz_intra_recon_cu(state, x, y, - depth, , + depth, NULL, NULL, lcu); diff --git a/src/search_intra.c b/src/search_intra.c index b4e94900..5f6ad457 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -171,18 +171,23 @@ static void get_cost_dual(encoder_state_t * const state, } -void INLINE rough_cost_prediction_mode(const encoder_state_t* const state, +double static INLINE rough_cost_prediction_mode(const encoder_state_t* const state, kvz_intra_references* const references, const cu_loc_t* const cu_loc, - const int depth, + kvz_pixel *ref_pixels, const color_t color, intra_search_data_t * data, lcu_t* lcu) { - const int width = MAX(4, (color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C) >> depth); - const int height= MAX(4, (color == COLOR_Y ? 
LCU_WIDTH : LCU_WIDTH_C) >> depth); - kvz_pixel pred[TR_MAX_WIDTH * TR_MAX_WIDTH + SIMD_ALIGNMENT]; - // kvz_intra_predict(state, references, width, height, pred, data); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + cost_pixel_nxn_func* satd_func = kvz_pixels_get_satd_func(width); + cost_pixel_nxn_func* sad_func = kvz_pixels_get_sad_func(width); + + kvz_pixel _pred[TR_MAX_WIDTH * TR_MAX_WIDTH + SIMD_ALIGNMENT]; + kvz_pixel* pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); + kvz_intra_predict(state, references, cu_loc, color, pred, data, lcu); + + double cost = get_cost(state, pred, ref_pixels, satd_func, sad_func, width); } @@ -375,6 +380,7 @@ static double search_intra_trdepth( } } if(reconstruct_chroma) { + int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode = -1; pred_cu->intra.mode_chroma = chroma_mode; pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently @@ -384,6 +390,7 @@ static double search_intra_trdepth( pred_cu, lcu); best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + pred_cu->intra.mode = luma_mode; } pred_cu->tr_skip = best_tr_idx == MTS_SKIP; pred_cu->tr_idx = best_tr_idx; @@ -569,11 +576,15 @@ static void search_intra_chroma_rough( * * \return Number of prediction modes in param modes. */ -static int8_t search_intra_rough(encoder_state_t * const state, - kvz_pixel *orig, int32_t origstride, - kvz_intra_references *refs, - int log2_width, int8_t *intra_preds, - int8_t modes[67], double costs[67]) +static int8_t search_intra_rough( + encoder_state_t * const state, + kvz_pixel *orig, + int32_t origstride, + kvz_intra_references *refs, + int log2_width, + int8_t *intra_preds, + intra_search_data_t* modes_out, + cu_info_t* const pred_cu) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? 
assert(log2_width >= 2 && log2_width <= 5); @@ -582,6 +593,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, cost_pixel_nxn_func *sad_func = kvz_pixels_get_sad_func(width); cost_pixel_nxn_multi_func *satd_dual_func = kvz_pixels_get_satd_dual_func(width); cost_pixel_nxn_multi_func *sad_dual_func = kvz_pixels_get_sad_dual_func(width); + int8_t modes[KVZ_NUM_INTRA_MODES]; + double costs[KVZ_NUM_INTRA_MODES]; const kvz_config *cfg = &state->encoder_control->cfg; const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); @@ -616,6 +629,7 @@ static int8_t search_intra_rough(encoder_state_t * const state, cu_loc_t loc = { 0, 0, width, width, width, width }; intra_search_data_t search_proxy; FILL(search_proxy, 0); + search_proxy.pred_cu = *pred_cu; for (int mode = 2; mode <= 66; mode += PARALLEL_BLKS * offset) { @@ -685,10 +699,10 @@ static int8_t search_intra_rough(encoder_state_t * const state, } } - int8_t add_modes[5] = {intra_preds[0], intra_preds[1], intra_preds[2], 0, 1}; + int8_t add_modes[INTRA_MPM_COUNT + 2] = {intra_preds[0], intra_preds[1], intra_preds[2], intra_preds[3], intra_preds[4], intra_preds[5], 0, 1}; // Add DC, planar and missing predicted modes. 
- for (int8_t pred_i = 0; pred_i < 5; ++pred_i) { + for (int8_t pred_i = 0; pred_i < (INTRA_MPM_COUNT + 2); ++pred_i) { bool has_mode = false; int8_t mode = add_modes[pred_i]; @@ -719,10 +733,10 @@ static int8_t search_intra_rough(encoder_state_t * const state, int smaller_than_pred = 0; double bits; for(; i < INTRA_MPM_COUNT; i++) { - if (intra_preds[i] == mode_i) { + if (intra_preds[i] == modes[mode_i]) { break; } - if(mode_i > intra_preds[i]) { + if(modes[mode_i] > intra_preds[i]) { smaller_than_pred += 1; } } @@ -736,12 +750,25 @@ static int8_t search_intra_rough(encoder_state_t * const state, bits = not_mpm_mode_bit + 5 + (mode_i - smaller_than_pred > 3); } costs[mode_i] += state->lambda_sqrt * bits; + modes_out[mode_i].cost = costs[mode_i]; + modes_out[mode_i].pred_cu = *pred_cu; + modes_out[mode_i].pred_cu.intra.mode = modes[mode_i]; + modes_out[mode_i].pred_cu.intra.mode_chroma = modes[mode_i]; } #undef PARALLEL_BLKS return modes_selected; } + +void search_mip_rough( + encoder_state_t* const state, + cu_loc_t* const cu_loc) +{ + +} + + /** * \brief Find best intra mode out of the ones listed in parameter modes. * @@ -768,147 +795,41 @@ static int8_t search_intra_rough(encoder_state_t * const state, * \param[out] lcu If transform split searching is used, the transform split * information for the best mode is saved in lcu.cu structure. 
*/ -static int8_t search_intra_rdo(encoder_state_t * const state, - int x_px, int y_px, int depth, - kvz_pixel *orig, int32_t origstride, - int8_t *intra_preds, - int modes_to_check, - int8_t modes[67], int8_t trafo[67], double costs[67], - int num_mip_modes_full, - int8_t mip_modes[32], int8_t mip_trafo[32], double mip_costs[32], - lcu_t *lcu, - uint8_t multi_ref_idx) +static int8_t search_intra_rdo( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + int modes_to_check, + intra_search_data_t *search_data, + lcu_t *lcu) { const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: proper height for non-square blocks - - kvz_pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1]; - - kvz_pixels_blit(orig, orig_block, width, height, origstride, width); - - // Check that the predicted modes are in the RDO mode list - if (modes_to_check < 67) { - int pred_mode = 0; - // Skip planar if searching modes for MRL - if (multi_ref_idx != 0) { - pred_mode = 1; - } - for (; pred_mode < 6; pred_mode++) { - int mode_found = 0; - for (int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode++) { - if (intra_preds[pred_mode] == modes[rdo_mode]) { - mode_found = 1; - break; - } - } - // Add this prediction mode to RDO checking - if (!mode_found) { - modes[modes_to_check] = intra_preds[pred_mode]; - modes_to_check++; - } - } - } - - // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad. - // MIP_TODO: loop through normal intra modes first - for (int mip = 0; mip <= 1; mip++) { - const int transp_off = mip ? num_mip_modes_full >> 1 : 0; - uint8_t multi_ref_index = mip ? 0 : multi_ref_idx; - int *num_modes = mip ? 
&num_mip_modes_full : &modes_to_check; + for (int mode = 0; mode < modes_to_check; mode++) { + double rdo_bitcost = kvz_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); + search_data[mode].bits = rdo_bitcost; + search_data[mode].cost = rdo_bitcost * state->lambda; - for (uint8_t i = 0; i < *num_modes; i++) { - int8_t mode = mip ? mip_modes[i] : modes[i]; - double *mode_cost_p = mip ? &mip_costs[i] : &costs[i]; - int8_t *mode_trafo_p = mip ? &mip_trafo[i] : &trafo[i]; - - // Mip related stuff - // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream. - // Half of the modes [16, 31] are indicated with the separate transpose flag. - // Number of possible modes is less for larger blocks. - const bool is_transposed = mip ? (mode >= transp_off ? true : false) : 0; - int8_t pred_mode = (is_transposed ? mode - transp_off : mode); - - // Perform transform split search and save mode RD cost for the best one. - cu_info_t pred_cu; - pred_cu.depth = depth; - pred_cu.type = CU_INTRA; - pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks - pred_cu.skipped = 0; - pred_cu.merged = 0; - pred_cu.bdpcmMode = 0; - pred_cu.intra.mode = pred_mode; - pred_cu.intra.mode_chroma = pred_mode; - pred_cu.intra.multi_ref_idx = multi_ref_index; - pred_cu.intra.mip_is_transposed = is_transposed; - pred_cu.intra.mip_flag = mip ? true : false; - pred_cu.joint_cb_cr = 0; - FILL(pred_cu.cbf, 0); - - double rdo_bitcost = kvz_luma_mode_bits(state, &pred_cu, x_px, y_px, depth, lcu); - *mode_cost_p = rdo_bitcost * state->lambda; - - // Reset transform split data in lcu.cu for this area. 
- kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, , lcu); - *mode_cost_p += mode_cost; - *mode_trafo_p = pred_cu.tr_idx; - - // Early termination if no coefficients has to be coded - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) { - *num_modes = i + 1; - break; - } + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu); + search_data[mode].cost += mode_cost; + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { + modes_to_check = mode + 1; + break; } } // Update order according to new costs - kvz_sort_modes_intra_luma(modes, trafo, costs, modes_to_check); - bool use_mip = false; - if (num_mip_modes_full) { - kvz_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes_full); - if (costs[0] > mip_costs[0]) { - use_mip = true; + double best_cost = MAX_INT; + int best_mode; + for (int mode = 0; mode < modes_to_check; mode++) { + if(search_data[mode].cost < best_cost) { + best_cost = search_data[mode].cost; + best_mode = mode; } } - + search_data[0] = search_data[best_mode]; - - // The best transform split hierarchy is not saved anywhere, so to get the - // transform split hierarchy the search has to be performed again with the - // best mode. - //if (tr_depth != depth) { - // cu_info_t pred_cu; - // pred_cu.depth = depth; - // pred_cu.type = CU_INTRA; - // pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); - // pred_cu.skipped = 0; - // pred_cu.merged = 0; - // pred_cu.bdpcmMode = 0; - // if (use_mip) { - // int transp_off = num_mip_modes_full >> 1; - // bool is_transposed = (mip_modes[0] >= transp_off ? true : false); - // int8_t pred_mode = (is_transposed ? 
mip_modes[0] - transp_off : mip_modes[0]); - // pred_cu.intra.mode = pred_mode; - // pred_cu.intra.mode_chroma = pred_mode; - // pred_cu.intra.multi_ref_idx = 0; - // pred_cu.intra.mip_flag = true; - // pred_cu.intra.mip_is_transposed = is_transposed; - // } - // else { - // pred_cu.intra.mode = modes[0]; - // pred_cu.intra.mode_chroma = modes[0]; - // pred_cu.intra.multi_ref_idx = multi_ref_idx; - // pred_cu.intra.mip_flag = false; - // pred_cu.intra.mip_is_transposed = false; - // } - // FILL(pred_cu.cbf, 0); - // search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]); - //} - - // TODO: modes to check does not consider mip modes. Maybe replace with array when mip search is optimized? return modes_to_check; } @@ -1106,7 +1027,7 @@ void kvz_search_cu_intra( const int x_px, const int y_px, const int depth, - intra_search_data_t* search_data, + intra_search_data_t* mode_out, lcu_t *lcu) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; @@ -1118,6 +1039,8 @@ void kvz_search_cu_intra( kvz_intra_references refs; int8_t candidate_modes[INTRA_MPM_COUNT]; + // Normal intra modes + mrl modes + mip modes + intra_search_data_t search_data[KVZ_NUM_INTRA_MODES +(MAX_REF_LINE_IDX - 1) * (INTRA_MPM_COUNT - 1) + 32]; cu_info_t *left_cu = 0; cu_info_t *above_cu = 0; @@ -1140,62 +1063,64 @@ void kvz_search_cu_intra( kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp, NULL, 0); } - int8_t modes[MAX_REF_LINE_IDX][67]; - int8_t trafo[MAX_REF_LINE_IDX][67] = { 0 }; - double costs[MAX_REF_LINE_IDX][67]; - - bool enable_mip = state->encoder_control->cfg.mip; - // The maximum number of mip modes is 32. Max modes can be less depending on block size. 
- // Half of the possible modes are transposed, which is indicated by a separate transpose flag - int8_t mip_modes[32]; - int8_t mip_trafo[32]; - double mip_costs[32]; - // The maximum number of possible MIP modes depend on block size & shape int width = LCU_WIDTH >> depth; int height = width; // TODO: proper height for non-square blocks. - int num_mip_modes = 0; - - if (enable_mip) { - for (int i = 0; i < 32; ++i) { - mip_modes[i] = i; - mip_costs[i] = MAX_INT; - } - // MIP is not allowed for 64 x 4 or 4 x 64 blocks - if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { - num_mip_modes = NUM_MIP_MODES_FULL(width, height); - } - } // Find best intra mode for 2Nx2N. kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; - int8_t number_of_modes[MAX_REF_LINE_IDX] = { 0 }; + // Need to set some data for all cus + cu_info_t temp_pred_cu; + FILL(temp_pred_cu, 0); + temp_pred_cu.depth = depth; + temp_pred_cu.type = CU_INTRA; + + int8_t number_of_modes; bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { - number_of_modes[0] = search_intra_rough(state, - ref_pixels, LCU_WIDTH, + number_of_modes = search_intra_rough(state, + ref_pixels, + LCU_WIDTH, &refs, log2_width, candidate_modes, - modes[0], costs[0]); - // Copy rough results for other reference lines - for (int line = 1; line < MAX_REF_LINE_IDX; ++line) { - number_of_modes[line] = number_of_modes[0]; - for (int i = 0; i < number_of_modes[line]; ++i) { - modes[line][i] = modes[0][i]; - costs[line][i] = costs[0][i]; - } - } + search_data, &temp_pred_cu); + } else { - for(int line = 0; line < MAX_REF_LINE_IDX; ++line) { - number_of_modes[line] = 67; - for (int i = 0; i < number_of_modes[line]; ++i) { - modes[line][i] = i; - costs[line][i] = MAX_INT; + for (int8_t i = 0; i < KVZ_NUM_INTRA_MODES; i++) { + search_data[i].pred_cu = temp_pred_cu; + search_data[i].pred_cu.intra.mode = i; + search_data[i].pred_cu.intra.mode_chroma = i; + 
search_data[i].cost = MAX_INT; + } + number_of_modes = KVZ_NUM_INTRA_MODES; + } + + int num_mip_modes = 0; + if (state->encoder_control->cfg.mip) { + // MIP is not allowed for 64 x 4 or 4 x 64 blocks + if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { + num_mip_modes = NUM_MIP_MODES_FULL(width, height); + } + for (int transpose = 0; transpose < 2; transpose++) { + const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + for (int i = 0; i < half_mip_modes; ++i) { + const int index = i + number_of_modes + transpose * half_mip_modes; + search_data[index].pred_cu = temp_pred_cu; + search_data[index].pred_cu.intra.mip_flag = 1; + search_data[index].pred_cu.intra.mode = i; + search_data[index].pred_cu.intra.mip_is_transposed = transpose; + search_data[index].pred_cu.intra.mode_chroma = 0; + search_data[index].cost = MAX_INT; } } + if(!skip_rough_search) { + + } + } + uint8_t lines = 1; // Find modes with multiple reference lines if in use. Do not use if CU in first row. if (state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0) { @@ -1216,72 +1141,20 @@ void kvz_search_cu_intra( // Check only the predicted modes. number_of_modes_to_search = 0; } + sort_modes(search_data, number_of_modes); + + + // TODO: if rough search is implemented for MIP, sort mip_modes here. + search_intra_rdo( + state, + x_px, + y_px, + depth, + number_of_modes_to_search, + search_data, + lcu); - for(int8_t line = 0; line < lines; ++line) { - // For extra reference lines, only check predicted modes & no MIP search. - if (line != 0) { - number_of_modes_to_search = 0; - num_mip_modes = 0; - } - int num_modes_to_check = MIN(number_of_modes[line], number_of_modes_to_search); - kvz_sort_modes(modes[line], costs[line], number_of_modes[line]); - // TODO: if rough search is implemented for MIP, sort mip_modes here. 
- number_of_modes[line] = search_intra_rdo(state, - x_px, y_px, depth, - ref_pixels, LCU_WIDTH, - candidate_modes, - num_modes_to_check, - modes[line], trafo[line], costs[line], - num_mip_modes, - mip_modes, mip_trafo, mip_costs, - lcu, line); - } } - uint8_t best_line = 0; - double best_line_mode_cost = costs[0][0]; - uint8_t best_mip_mode_idx = 0; - uint8_t best_mode_indices[MAX_REF_LINE_IDX]; - - int8_t tmp_best_mode; - int8_t tmp_best_trafo; - double tmp_best_cost; - bool tmp_mip_flag = false; - bool tmp_mip_transp = false; - - for (int line = 0; line < lines; ++line) { - best_mode_indices[line] = select_best_mode_index(modes[line], costs[line], number_of_modes[line]); - if (best_line_mode_cost > costs[line][best_mode_indices[line]]) { - best_line_mode_cost = costs[line][best_mode_indices[line]]; - best_line = line; - } - } - - tmp_best_mode = modes[best_line][best_mode_indices[best_line]]; - tmp_best_trafo = trafo[best_line][best_mode_indices[best_line]]; - tmp_best_cost = costs[best_line][best_mode_indices[best_line]]; - - if (num_mip_modes) { - best_mip_mode_idx = select_best_mode_index(mip_modes, mip_costs, num_mip_modes); - if (tmp_best_cost > mip_costs[best_mip_mode_idx]) { - tmp_best_mode = mip_modes[best_mip_mode_idx]; - tmp_best_trafo = mip_trafo[best_mip_mode_idx]; - tmp_best_cost = mip_costs[best_mip_mode_idx]; - tmp_mip_flag = true; - tmp_mip_transp = (tmp_best_mode >= (num_mip_modes >> 1)) ? 1 : 0; - } - } - - if (tmp_mip_flag) { - // Transform best mode index to proper form. - // Max mode index is half of max number of modes - 1 (i. e. for size id 2, max mode id is 5) - tmp_best_mode = (tmp_mip_transp ? tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode); - } - - search_data->luma_mode = tmp_best_mode; - search_data->mts_idx = tmp_best_trafo; - *cost_out = tmp_best_cost; - search_data->mip_flag = tmp_mip_flag; - search_data->mip_transp = tmp_mip_transp; - search_data->multi_ref_idx = tmp_mip_flag ? 
0 : best_line; + *mode_out = search_data[0]; } From 88c01b6d3285957a832701f1e8d971636a5ed2b8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 12 Apr 2022 14:51:49 +0300 Subject: [PATCH 117/135] Fix warnings/errors --- src/intra.c | 38 +++++++++++++++++++------------------- src/intra.h | 25 +++++-------------------- src/search.c | 10 ++++++---- src/search_intra.c | 22 +++++++++++++--------- 4 files changed, 43 insertions(+), 52 deletions(-) diff --git a/src/intra.c b/src/intra.c index 09ced04b..e9cdd1de 100644 --- a/src/intra.c +++ b/src/intra.c @@ -83,9 +83,9 @@ static const uint8_t num_ref_pixels_left[16][16] = { }; -void static mip_predict( +static void mip_predict( const encoder_state_t* const state, - kvz_intra_references* const refs, + const kvz_intra_references* const refs, const uint16_t pred_block_width, const uint16_t pred_block_height, kvz_pixel* dst, @@ -479,7 +479,7 @@ static void linear_transform_cclm(const cclm_parameters_t* cclm_params, kvz_pixe } -void kvz_predict_cclm( +void predict_cclm( encoder_state_t const* const state, const color_t color, const int8_t width, @@ -488,7 +488,7 @@ void kvz_predict_cclm( const int16_t y0, const int16_t stride, const int8_t mode, - lcu_t* const lcu, + const lcu_t* const lcu, kvz_intra_references* chroma_ref, kvz_pixel* dst, cclm_parameters_t* cclm_params @@ -508,7 +508,7 @@ void kvz_predict_cclm( int available_left_below = 0; - kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; + const kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // Essentially what this does is that it uses 6-tap filtering to downsample @@ -520,7 +520,7 @@ void kvz_predict_cclm( if (y0) { for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; - cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, 
x_extension, y_scu - 4); if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break; } if(y_scu == 0) { @@ -545,7 +545,7 @@ void kvz_predict_cclm( if(x0) { for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; - cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension); if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break; if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; } @@ -742,9 +742,9 @@ void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* /** \brief Matrix weighted intra prediction. */ -void static mip_predict( +static void mip_predict( const encoder_state_t* const state, - kvz_intra_references* const refs, + const kvz_intra_references* const refs, const uint16_t pred_block_width, const uint16_t pred_block_height, kvz_pixel* dst, @@ -892,7 +892,7 @@ void static mip_predict( } -void static intra_predict_regular( +static void intra_predict_regular( const encoder_state_t* const state, kvz_intra_references *refs, int_fast8_t log2_width, @@ -1369,15 +1369,14 @@ void kvz_intra_build_reference( void kvz_intra_predict( const encoder_state_t* const state, - const kvz_intra_references* const refs, + kvz_intra_references* const refs, const cu_loc_t* const cu_loc, const color_t color, kvz_pixel* dst, - intra_search_data_t* data, + const intra_search_data_t* data, const lcu_t* lcu ) { - const kvz_config* cfg = &state->encoder_control->cfg; const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // TODO: what is this used for? 
// const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); @@ -1407,8 +1406,9 @@ void kvz_intra_predict( else { kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { - kvz_predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, &data->cclm_parameters[color == COLOR_U ? 0 : 1]); + predict_cclm( + state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, + (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]); } else { linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); @@ -1418,7 +1418,7 @@ void kvz_intra_predict( static void intra_recon_tb_leaf( - const encoder_state_t* const state, + encoder_state_t* const state, int x, int y, int depth, @@ -1472,8 +1472,8 @@ static void intra_recon_tb_leaf( cu_loc_t loc = { x, y, - width, width, - width, width, + width, height, + width, height, }; kvz_intra_predict(state, &refs, &loc, color, pred, search_data, lcu); @@ -1518,7 +1518,7 @@ static void intra_recon_tb_leaf( * \param lcu containing LCU */ void kvz_intra_recon_cu( - const encoder_state_t* const state, + encoder_state_t* const state, int x, int y, int depth, diff --git a/src/intra.h b/src/intra.h index 59aa10c9..a5f46ed2 100644 --- a/src/intra.h +++ b/src/intra.h @@ -127,15 +127,16 @@ void kvz_intra_build_reference( */ void kvz_intra_predict( const encoder_state_t* const state, - const kvz_intra_references* const refs, + kvz_intra_references* const refs, const cu_loc_t* const cu_loc, const color_t color, kvz_pixel* dst, - intra_search_data_t* data, - const lcu_t* lcu); + const intra_search_data_t* data, + const lcu_t* lcu +); void kvz_intra_recon_cu( - const encoder_state_t* const state, + encoder_state_t* const state, int x, int y, int depth, @@ -143,20 +144,4 @@ 
void kvz_intra_recon_cu( cu_info_t *cur_cu, lcu_t *lcu); - -void kvz_predict_cclm( - encoder_state_t const* const state, - const color_t color, - const int8_t width, - const int8_t height, - const int16_t x0, - const int16_t y0, - const int16_t stride, - const int8_t mode, - lcu_t* const lcu, - kvz_intra_references* chroma_ref, - kvz_pixel* dst, - cclm_parameters_t* cclm_params -); - int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); diff --git a/src/search.c b/src/search.c index 93f78561..a3fcbc93 100644 --- a/src/search.c +++ b/src/search.c @@ -1200,13 +1200,15 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1; + + intra_search_data_t proxy; + FILL(proxy, 0); + proxy.pred_cu = *cur_cu; kvz_intra_recon_cu(state, x, y, - depth, NULL, + depth, + &proxy, NULL, lcu); diff --git a/src/search_intra.c b/src/search_intra.c index 5f6ad457..d616184f 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -171,7 +171,7 @@ static void get_cost_dual(encoder_state_t * const state, } -double static INLINE rough_cost_prediction_mode(const encoder_state_t* const state, + static INLINE double rough_cost_prediction_mode(encoder_state_t* const state, kvz_intra_references* const references, const cu_loc_t* const cu_loc, kvz_pixel *ref_pixels, @@ -188,6 +188,7 @@ double static INLINE rough_cost_prediction_mode(const encoder_state_t* const sta kvz_intra_predict(state, references, cu_loc, color, pred, data, lcu); double cost = get_cost(state, pred, ref_pixels, satd_func, sad_func, width); + return cost; } @@ -596,8 +597,8 @@ static int8_t search_intra_rough( int8_t modes[KVZ_NUM_INTRA_MODES]; double 
costs[KVZ_NUM_INTRA_MODES]; - const kvz_config *cfg = &state->encoder_control->cfg; - const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); + // const kvz_config *cfg = &state->encoder_control->cfg; + // const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); // Temporary block arrays kvz_pixel _preds[PARALLEL_BLKS * 32 * 32 + SIMD_ALIGNMENT]; @@ -821,7 +822,7 @@ static int8_t search_intra_rdo( // Update order according to new costs double best_cost = MAX_INT; - int best_mode; + int best_mode = 0; for (int mode = 0; mode < modes_to_check; mode++) { if(search_data[mode].cost < best_cost) { best_cost = search_data[mode].cost; @@ -905,9 +906,6 @@ int8_t kvz_search_intra_chroma_rdo( if (reconstruct_chroma) { - - int c_width = MAX(32 >> (depth), 4); - kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); @@ -924,7 +922,6 @@ int8_t kvz_search_intra_chroma_rdo( lcu); } - double bits = 0; if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { chroma_data[i].cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); } else { @@ -1119,13 +1116,20 @@ void kvz_search_cu_intra( } } + number_of_modes += num_mip_modes; - + int num_mrl_modes = 0; uint8_t lines = 1; // Find modes with multiple reference lines if in use. Do not use if CU in first row. if (state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0) { lines = MAX_REF_LINE_IDX; } + for(int line = 1; line < lines; ++line) { + for(int i = 1; i < INTRA_MPM_COUNT; i++) { + num_mrl_modes++; + } + } + number_of_modes += num_mrl_modes; // Set transform depth to current depth, meaning no transform splits. 
kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); From 5f1e9c820f872939b5c1d137badb8aa876feeddb Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 10:53:27 +0300 Subject: [PATCH 118/135] Rough search for mrl and mip --- src/search_intra.c | 114 ++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index d616184f..e9eb0bc8 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -171,27 +171,6 @@ static void get_cost_dual(encoder_state_t * const state, } - static INLINE double rough_cost_prediction_mode(encoder_state_t* const state, - kvz_intra_references* const references, - const cu_loc_t* const cu_loc, - kvz_pixel *ref_pixels, - const color_t color, - intra_search_data_t * data, - lcu_t* lcu) -{ - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - cost_pixel_nxn_func* satd_func = kvz_pixels_get_satd_func(width); - cost_pixel_nxn_func* sad_func = kvz_pixels_get_sad_func(width); - - kvz_pixel _pred[TR_MAX_WIDTH * TR_MAX_WIDTH + SIMD_ALIGNMENT]; - kvz_pixel* pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); - kvz_intra_predict(state, references, cu_loc, color, pred, data, lcu); - - double cost = get_cost(state, pred, ref_pixels, satd_func, sad_func, width); - return cost; -} - - /** * \brief Derives mts_last_scan_pos and violates_mts_coeff_constraint for pred_cu. * @@ -577,7 +556,7 @@ static void search_intra_chroma_rough( * * \return Number of prediction modes in param modes. 
*/ -static int8_t search_intra_rough( +static int16_t search_intra_rough( encoder_state_t * const state, kvz_pixel *orig, int32_t origstride, @@ -762,11 +741,39 @@ static int8_t search_intra_rough( } -void search_mip_rough( - encoder_state_t* const state, - cu_loc_t* const cu_loc) +static void get_rough_cost_for_n_modes( + encoder_state_t* const state, + kvz_intra_references* refs, + const cu_loc_t* const cu_loc, + kvz_pixel *orig, + int orig_stride, + intra_search_data_t *search_data, + int num_modes) { +#define PARALLEL_BLKS 2 + assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_n_modes"); + const int width = cu_loc->width; + cost_pixel_nxn_multi_func* satd_dual_func = kvz_pixels_get_satd_dual_func(width); + cost_pixel_nxn_multi_func* sad_dual_func = kvz_pixels_get_sad_dual_func(width); + + kvz_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; + pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); + + kvz_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT]; + kvz_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); + + kvz_pixels_blit(orig, orig_block, width, width, orig_stride, width); + double costs_out[PARALLEL_BLKS] = { 0 }; + for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { + for (int i = 0; i < PARALLEL_BLKS; ++i) { + kvz_intra_predict(state, refs, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); + } + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + search_data[mode].cost = costs_out[0]; + search_data[mode + 1].cost = costs_out[1]; + } +#undef PARALLEL_BLKS } @@ -1029,6 +1036,8 @@ void kvz_search_cu_intra( { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const int8_t cu_width = LCU_WIDTH >> depth; + const cu_loc_t cu_loc = { x_px, y_px, cu_width, cu_width, + MAX(cu_width >> 1, TR_MIN_WIDTH), MAX(cu_width >> 1, TR_MIN_WIDTH) }; const int_fast8_t log2_width = LOG2_LCU_WIDTH 
- depth; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); @@ -1073,7 +1082,7 @@ void kvz_search_cu_intra( temp_pred_cu.depth = depth; temp_pred_cu.type = CU_INTRA; - int8_t number_of_modes; + int16_t number_of_modes; bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { number_of_modes = search_intra_rough(state, @@ -1098,38 +1107,47 @@ void kvz_search_cu_intra( // MIP is not allowed for 64 x 4 or 4 x 64 blocks if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { num_mip_modes = NUM_MIP_MODES_FULL(width, height); - } - for (int transpose = 0; transpose < 2; transpose++) { - const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); - for (int i = 0; i < half_mip_modes; ++i) { - const int index = i + number_of_modes + transpose * half_mip_modes; - search_data[index].pred_cu = temp_pred_cu; - search_data[index].pred_cu.intra.mip_flag = 1; - search_data[index].pred_cu.intra.mode = i; - search_data[index].pred_cu.intra.mip_is_transposed = transpose; - search_data[index].pred_cu.intra.mode_chroma = 0; - search_data[index].cost = MAX_INT; + + for (int transpose = 0; transpose < 2; transpose++) { + const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + for (int i = 0; i < half_mip_modes; ++i) { + const int index = i + number_of_modes + transpose * half_mip_modes; + search_data[index].pred_cu = temp_pred_cu; + search_data[index].pred_cu.intra.mip_flag = 1; + search_data[index].pred_cu.intra.mode = i; + search_data[index].pred_cu.intra.mip_is_transposed = transpose; + search_data[index].pred_cu.intra.mode_chroma = 0; + search_data[index].cost = MAX_INT; + } + } + if(!skip_rough_search) { + get_rough_cost_for_n_modes(state, &refs, &cu_loc, + ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes); } } - if(!skip_rough_search) { - - } - + number_of_modes += num_mip_modes; } - number_of_modes += num_mip_modes; int num_mrl_modes = 0; - uint8_t lines = 1; // Find modes 
with multiple reference lines if in use. Do not use if CU in first row. - if (state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0) { - lines = MAX_REF_LINE_IDX; - } + uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; + for(int line = 1; line < lines; ++line) { for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; + const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; + search_data[index].pred_cu = temp_pred_cu; + search_data[index].pred_cu.intra.mode = candidate_modes[i]; + search_data[index].pred_cu.intra.multi_ref_idx = line; + search_data[index].pred_cu.intra.mode_chroma = 0; + search_data[index].cost = MAX_INT; } + if (!skip_rough_search) { + get_rough_cost_for_n_modes(state, &refs, &cu_loc, + ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes); + } + number_of_modes += num_mrl_modes; } - number_of_modes += num_mrl_modes; // Set transform depth to current depth, meaning no transform splits. kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); @@ -1138,7 +1156,7 @@ void kvz_search_cu_intra( if (rdo_level >= 2 || skip_rough_search) { int number_of_modes_to_search; if (rdo_level == 4) { - number_of_modes_to_search = 67; + number_of_modes_to_search = number_of_modes; } else if (rdo_level == 2 || rdo_level == 3) { number_of_modes_to_search = (cu_width == 4) ? 
3 : 2; } else { From 6aa8240db0391435c10993cb612718d9b077765e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 10:58:39 +0300 Subject: [PATCH 119/135] Fix test_external_symbols --- src/intra.c | 2 +- src/search_intra.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intra.c b/src/intra.c index e9cdd1de..f2fc2658 100644 --- a/src/intra.c +++ b/src/intra.c @@ -479,7 +479,7 @@ static void linear_transform_cclm(const cclm_parameters_t* cclm_params, kvz_pixe } -void predict_cclm( +static void predict_cclm( encoder_state_t const* const state, const color_t color, const int8_t width, diff --git a/src/search_intra.c b/src/search_intra.c index e9eb0bc8..ee53ad59 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -457,7 +457,7 @@ static double search_intra_trdepth( return nosplit_cost; } } -void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) +static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) { // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about // 60% of the time, so there should be no need for anything more complex From f75b2fdb34194e9e453d53d24c3574d431a4072f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 12:32:59 +0300 Subject: [PATCH 120/135] Fix tr_depth setting and restore checking all pred_modes rd cost --- src/search_intra.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index ee53ad59..c4dfe61e 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -274,6 +274,7 @@ static double search_intra_trdepth( const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != KVZ_CSP_400; cu_info_t* pred_cu = &search_data->pred_cu; + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); struct { kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; @@ -287,6 +288,7 @@ static 
double search_intra_trdepth( if (depth > 0) { const bool mts_enabled = state->encoder_control->cfg.mts == KVZ_MTS_INTRA || state->encoder_control->cfg.mts == KVZ_MTS_BOTH; + tr_cu->tr_depth = depth; pred_cu->tr_depth = depth; nosplit_cost = 0.0; @@ -1078,9 +1080,9 @@ void kvz_search_cu_intra( // Need to set some data for all cus cu_info_t temp_pred_cu; - FILL(temp_pred_cu, 0); - temp_pred_cu.depth = depth; + temp_pred_cu = *cur_cu; temp_pred_cu.type = CU_INTRA; + FILL(temp_pred_cu.intra, 0); int16_t number_of_modes; bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); @@ -1139,7 +1141,7 @@ void kvz_search_cu_intra( search_data[index].pred_cu = temp_pred_cu; search_data[index].pred_cu.intra.mode = candidate_modes[i]; search_data[index].pred_cu.intra.multi_ref_idx = line; - search_data[index].pred_cu.intra.mode_chroma = 0; + search_data[index].pred_cu.intra.mode_chroma = candidate_modes[i]; search_data[index].cost = MAX_INT; } if (!skip_rough_search) { @@ -1163,8 +1165,25 @@ void kvz_search_cu_intra( // Check only the predicted modes. number_of_modes_to_search = 0; } - sort_modes(search_data, number_of_modes); + if(!skip_rough_search) { + sort_modes(search_data, number_of_modes); + } + for(int pred_mode = 0; pred_mode < INTRA_MPM_COUNT; ++pred_mode) { + bool mode_found = false; + for(int i = 0; i < number_of_modes_to_search; i++) { + if(search_data[i].pred_cu.intra.mode == candidate_modes[pred_mode]) { + mode_found = true; + break; + } + } + if(!mode_found) { + search_data[number_of_modes_to_search].pred_cu = temp_pred_cu; + search_data[number_of_modes_to_search].pred_cu.intra.mode = candidate_modes[pred_mode]; + search_data[number_of_modes_to_search].pred_cu.intra.mode_chroma = candidate_modes[pred_mode]; + number_of_modes_to_search++; + } + } // TODO: if rough search is implemented for MIP, sort mip_modes here. 
search_intra_rdo( From b8e5e1d9e321c818f7a41192dbb775e4e80e2180 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 12:39:39 +0300 Subject: [PATCH 121/135] Fix mrl rough search --- src/search_intra.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index c4dfe61e..b8c2ea11 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1144,12 +1144,12 @@ void kvz_search_cu_intra( search_data[index].pred_cu.intra.mode_chroma = candidate_modes[i]; search_data[index].cost = MAX_INT; } - if (!skip_rough_search) { - get_rough_cost_for_n_modes(state, &refs, &cu_loc, - ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes); - } - number_of_modes += num_mrl_modes; } + if (!skip_rough_search && lines != 1) { + get_rough_cost_for_n_modes(state, &refs, &cu_loc, + ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes); + } + number_of_modes += num_mrl_modes; // Set transform depth to current depth, meaning no transform splits. 
kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); From 69dfd816f1b4354fac5fde3a5a0fe5df9a7f93d3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 12:50:50 +0300 Subject: [PATCH 122/135] oops --- src/cabac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cabac.h b/src/cabac.h index 65de6f92..eff15220 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -159,7 +159,7 @@ extern const float kvz_f_entropy_bits[512]; #define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ - if((cabac)->only_count || 1) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ + if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ if((cabac)->update) {\ (cabac)->cur_ctx = ctx;\ CABAC_BIN((cabac), (val), (name));\ From 43c2f9318e6d0dfd1513822c971a08685fcce313 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 12:54:30 +0300 Subject: [PATCH 123/135] Fix mip mode generation --- src/search_intra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_intra.c b/src/search_intra.c index b8c2ea11..ec5bc3a0 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1118,7 +1118,7 @@ void kvz_search_cu_intra( search_data[index].pred_cu.intra.mip_flag = 1; search_data[index].pred_cu.intra.mode = i; search_data[index].pred_cu.intra.mip_is_transposed = transpose; - search_data[index].pred_cu.intra.mode_chroma = 0; + search_data[index].pred_cu.intra.mode_chroma = i; search_data[index].cost = MAX_INT; } } From 2c48453e59d70ccae39303bed3aca089b99e6816 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 13:44:09 +0300 Subject: [PATCH 124/135] Fix mip mode count macros --- src/search.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.h b/src/search.h index 919b6cab..564c69ca 100644 --- a/src/search.h +++ b/src/search.h @@ -73,8 +73,8 @@ typedef struct 
unit_stats_map_t { int size; //!< number of active elements in the lists } unit_stats_map_t; -#define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12) -#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1 +#define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)) +#define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1) void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length); From abcf2a12b8db11a6846044e94a408cc917699094 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 13 Apr 2022 13:52:01 +0300 Subject: [PATCH 125/135] Fix setting chroma mode to planar for mip pus --- src/intra.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/intra.c b/src/intra.c index f2fc2658..907826c6 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1392,6 +1392,7 @@ void kvz_intra_predict( } else { use_mip = state->encoder_control->chroma_format == KVZ_CSP_444; + intra_mode = use_mip ? 
intra_mode : 0; } } if (intra_mode < 68) { From 901d60b8e7e3e0274e890b2adcdff65ba9ba1212 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 14 Apr 2022 07:53:13 +0300 Subject: [PATCH 126/135] Fix monochrome, maybe mts search and cabac debug --- src/cabac.h | 16 ++++++++-------- src/search_intra.c | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index eff15220..0526b0b3 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -198,23 +198,23 @@ extern uint32_t kvz_cabac_bins_count; extern bool kvz_cabac_bins_verbose; #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = CTX_STATE(data->cur_ctx); \ - if(kvz_cabac_bins_verbose && !data->only_count) {printf("%d %d [%d:%d] %s = %u, range = %u LPS = %u state = %u -> ", \ - kvz_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS(data->cur_ctx,(data)->range), CTX_LPS(data->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS(data->cur_ctx,(data)->range), prev_state); }\ + if(kvz_cabac_bins_verbose && !(data)->only_count) {printf("%d %d [%d:%d] %s = %u, range = %u LPS = %u state = %u -> ", \ + kvz_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS((data)->cur_ctx,(data)->range), CTX_LPS((data)->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS((data)->cur_ctx,(data)->range), prev_state); }\ kvz_cabac_encode_bin((data), (value)); \ - if(kvz_cabac_bins_verbose && !data->only_count) printf("%u\n", CTX_STATE(data->cur_ctx)); } + if(kvz_cabac_bins_verbose && !(data)->only_count) printf("%u\n", CTX_STATE((data)->cur_ctx)); } #define CABAC_BINS_EP(data, value, bins, name) { \ - uint32_t prev_state = CTX_STATE(data->cur_ctx); \ + uint32_t prev_state = (!(data)->only_count) ? 
CTX_STATE(data->cur_ctx) : 0; \ kvz_cabac_encode_bins_ep((data), (value), (bins)); \ if(kvz_cabac_bins_verbose && !data->only_count) { printf("%d %s = %u(%u bins), state = %u -> %u\n", \ - kvz_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE(data->cur_ctx)); kvz_cabac_bins_count+=bins;}} + kvz_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE((data)->cur_ctx)); kvz_cabac_bins_count+=(bins);}} #define CABAC_BIN_EP(data, value, name) { \ - uint32_t prev_state = CTX_STATE(data->cur_ctx); \ + uint32_t prev_state = (!(data)->only_count) ? CTX_STATE((data)->cur_ctx) : 0;; \ kvz_cabac_encode_bin_ep((data), (value)); \ - if(kvz_cabac_bins_verbose && !data->only_count) {printf("%d %s = %u, state = %u -> %u\n", \ - kvz_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE(data->cur_ctx)); }} + if(kvz_cabac_bins_verbose && !(data)->only_count) {printf("%d %s = %u, state = %u -> %u\n", \ + kvz_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE((data)->cur_ctx)); }} #else #define CABAC_BIN(data, value, name) \ kvz_cabac_encode_bin((data), (value)); diff --git a/src/search_intra.c b/src/search_intra.c index ec5bc3a0..218ed71c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -319,7 +319,7 @@ static double search_intra_trdepth( if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { num_transforms = MAX(num_transforms, 2); } - + pred_cu->intra.mode_chroma = -1; for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; if (mts_enabled) @@ -818,6 +818,7 @@ static int8_t search_intra_rdo( for (int mode = 0; mode < modes_to_check; mode++) { double rdo_bitcost = kvz_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); + search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; search_data[mode].bits = rdo_bitcost; search_data[mode].cost = rdo_bitcost * state->lambda; From 
24faf0024d0c2b9dc01ec980be2871c13dfa75d0 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 14 Apr 2022 09:49:18 +0300 Subject: [PATCH 127/135] Fix MTS and actually select the best intra mode for rd < 2 --- src/search_intra.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 218ed71c..fd1ba01d 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -346,7 +346,6 @@ static double search_intra_trdepth( derive_mts_constraints(pred_cu, lcu, depth, lcu_px); if (pred_cu->violates_mts_coeff_constraint || !pred_cu->mts_last_scan_pos) { - assert(pred_cu->tr_idx == MTS_TR_NUM); //mts mode should not be decided and then not allowed to be used. (might be some exception here) continue; } } @@ -1195,8 +1194,12 @@ void kvz_search_cu_intra( number_of_modes_to_search, search_data, lcu); - + // Reset these + search_data[0].pred_cu.violates_mts_coeff_constraint = false; + search_data[0].pred_cu.mts_last_scan_pos = false; + } + else { + sort_modes(search_data, number_of_modes); } - *mode_out = search_data[0]; } From f4dc3ab43b4aa2a9de26007867d9d2b4d01c32ac Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 14 Apr 2022 11:29:56 +0300 Subject: [PATCH 128/135] Fix jccr and chroma mode search --- src/intra.c | 2 +- src/search.c | 24 ++++++++++++++++++------ src/search_intra.c | 21 +++++++++++++++------ src/search_intra.h | 2 +- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/intra.c b/src/intra.c index 907826c6..88849c4e 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1589,7 +1589,7 @@ void kvz_intra_recon_cu( intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, + kvz_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), search_data->pred_cu.joint_cb_cr != 4 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0), x, y, depth, cur_cu, lcu, false); } diff --git 
a/src/search.c b/src/search.c index a3fcbc93..3686da07 100644 --- a/src/search.c +++ b/src/search.c @@ -996,21 +996,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. + cur_cu->joint_cb_cr = 0; + intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { - cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, intra_search.cclm_parameters); + cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search); + + if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4; + else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; intra_search.pred_cu.intra.mode = -1; // skip luma - intra_search.pred_cu.joint_cb_cr = 0; kvz_intra_recon_cu(state, x & ~7, y & ~7, // TODO: as does this depth, &intra_search, NULL, lcu); - cur_cu->intra.mode_chroma = intra_search.pred_cu.intra.mode_chroma; - cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr; - if(depth != 0 && state->encoder_control->cfg.jccr) { + if(depth != 0 && state->encoder_control->cfg.jccr && ctrl->cfg.rdo < 3) { kvz_select_jccr_mode(state, x & ~7, y & ~7, depth, @@ -1018,6 +1020,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, lcu, NULL); } + else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) { + assert(cur_cu->joint_cb_cr < 4); + const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 }; + int lcu_width = LCU_WIDTH_C; + const int index = lcu_px.x + lcu_px.y * lcu_width; + const int width = (depth < MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + kvz_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width); + kvz_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width); + + } } } else if (cur_cu->type == CU_INTER) { diff --git a/src/search_intra.c b/src/search_intra.c index fd1ba01d..5db56df4 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -927,14 +927,14 @@ int8_t kvz_search_intra_chroma_rdo( kvz_intra_recon_cu(state, x_px, y_px, depth, &chroma_data[i], - NULL, + &chroma_data[i].pred_cu, lcu); } if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { - chroma_data[i].cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + chroma_data[i].cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu); } else { - kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu, &chroma_data[i].cost); + kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost); } double mode_bits = kvz_chroma_mode_bits(state, mode, chroma_data[i].pred_cu.intra.mode); @@ -951,7 +951,7 @@ int8_t kvz_search_intra_chroma_rdo( int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, const int x_px, const int y_px, - const int depth, lcu_t *lcu, cclm_parameters_t *best_cclm) + const int depth, lcu_t *lcu, intra_search_data_t *search_data) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; @@ -987,6 +987,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, for (int i = 0; i < num_modes; i++) { chroma_data[i].pred_cu = *cur_pu; chroma_data[i].pred_cu.intra.mode_chroma = modes[i]; + chroma_data[i].pred_cu.intra.mode = -1; } // Don't do rough mode search if all modes are selected. 
// FIXME: It might make more sense to only disable rough search if @@ -1019,7 +1020,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, if (num_modes > 1) { intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data); } - + *search_data = chroma_data[0]; return intra_mode_chroma; } @@ -1199,7 +1200,15 @@ void kvz_search_cu_intra( search_data[0].pred_cu.mts_last_scan_pos = false; } else { - sort_modes(search_data, number_of_modes); + double best_cost = MAX_INT; + int best_mode = 0; + for (int mode = 0; mode < number_of_modes; mode++) { + if (search_data[mode].cost < best_cost) { + best_cost = search_data[mode].cost; + best_mode = mode; + } + } + search_data[0] = search_data[best_mode]; } *mode_out = search_data[0]; } diff --git a/src/search_intra.h b/src/search_intra.h index 33df7f2e..e7d0da42 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -50,7 +50,7 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, const int x_px, const int y_px, - const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm); + const int depth, lcu_t *lcu, intra_search_data_t* best_cclm); void kvz_search_cu_intra( encoder_state_t * const state, From b9618690e72c69ef9113c541d677cad16b77bc4b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 15 Apr 2022 08:55:02 +0300 Subject: [PATCH 129/135] Generate mrl references for mrl rough search --- src/search_intra.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 5db56df4..286bad96 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -768,7 +768,7 @@ static void get_rough_cost_for_n_modes( double costs_out[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { - kvz_intra_predict(state, refs, cu_loc, COLOR_Y, preds[i], 
&search_data[mode + i], NULL); + kvz_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); } get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); search_data[mode].cost = costs_out[0]; @@ -1042,10 +1042,12 @@ void kvz_search_cu_intra( const cu_loc_t cu_loc = { x_px, y_px, cu_width, cu_width, MAX(cu_width >> 1, TR_MIN_WIDTH), MAX(cu_width >> 1, TR_MIN_WIDTH) }; const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth; + const vector2d_t luma_px = { x_px, y_px }; + const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - kvz_intra_references refs; + kvz_intra_references refs[MAX_REF_LINE_IDX]; int8_t candidate_modes[INTRA_MPM_COUNT]; // Normal intra modes + mrl modes + mip modes @@ -1065,11 +1067,7 @@ void kvz_search_cu_intra( kvz_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); if (depth > 0) { - const vector2d_t luma_px = { x_px, y_px }; - const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - - // These references will only be used with rough search. No need for MRL stuff here. 
- kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp, NULL, 0); + kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); } // The maximum number of possible MIP modes depend on block size & shape @@ -1091,7 +1089,7 @@ void kvz_search_cu_intra( number_of_modes = search_intra_rough(state, ref_pixels, LCU_WIDTH, - &refs, + refs, log2_width, candidate_modes, search_data, &temp_pred_cu); @@ -1124,7 +1122,7 @@ void kvz_search_cu_intra( } } if(!skip_rough_search) { - get_rough_cost_for_n_modes(state, &refs, &cu_loc, + get_rough_cost_for_n_modes(state, refs, &cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes); } } @@ -1136,6 +1134,23 @@ void kvz_search_cu_intra( uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; for(int line = 1; line < lines; ++line) { + kvz_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; + + if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) { + videoframe_t* const frame = state->tile->frame; + + // Copy extra ref lines, including ref line 1 and top left corner. + for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { + int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; + height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. 
+ height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); + kvz_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], + &extra_refs[i * 128], + 1, height, + frame->rec->stride, 1); + } + } + kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; @@ -1147,7 +1162,7 @@ void kvz_search_cu_intra( } } if (!skip_rough_search && lines != 1) { - get_rough_cost_for_n_modes(state, &refs, &cu_loc, + get_rough_cost_for_n_modes(state, refs, &cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes); } number_of_modes += num_mrl_modes; From 61a3612395b5c497a4b42a9fd8ce21791d6543d2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 15 Apr 2022 11:20:20 +0300 Subject: [PATCH 130/135] Fix incorrect bit counting --- src/search_intra.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 286bad96..705f2c28 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -713,11 +713,11 @@ static int16_t search_intra_rough( int i = 0; int smaller_than_pred = 0; double bits; - for(; i < INTRA_MPM_COUNT; i++) { + for (; i < INTRA_MPM_COUNT; i++) { if (intra_preds[i] == modes[mode_i]) { break; } - if(modes[mode_i] > intra_preds[i]) { + if (modes[mode_i] > intra_preds[i]) { smaller_than_pred += 1; } } @@ -728,7 +728,7 @@ static int16_t search_intra_rough( bits = not_planar_mode_flag + mpm_mode_bit + MAX(i, 3); } else { - bits = not_mpm_mode_bit + 5 + (mode_i - smaller_than_pred > 3); + bits = not_mpm_mode_bit + 5 + (modes[mode_i] - smaller_than_pred > 3); } costs[mode_i] += state->lambda_sqrt * bits; modes_out[mode_i].cost = costs[mode_i]; From e9a081de4f3ddcc2163fa48d9d3fe6f22dbf6289 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 
15 Apr 2022 12:25:07 +0300 Subject: [PATCH 131/135] Count bits for mip and mrl --- src/search_intra.c | 57 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 705f2c28..1dca3bf6 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -565,7 +565,8 @@ static int16_t search_intra_rough( int log2_width, int8_t *intra_preds, intra_search_data_t* modes_out, - cu_info_t* const pred_cu) + cu_info_t* const pred_cu, + uint8_t mip_ctx) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? assert(log2_width >= 2 && log2_width <= 5); @@ -705,6 +706,8 @@ static int16_t search_intra_rough( // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. + const double not_mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0; + const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); @@ -725,11 +728,12 @@ static int16_t search_intra_rough( bits = planar_mode_flag + mpm_mode_bit; } else if (i < INTRA_MPM_COUNT) { - bits = not_planar_mode_flag + mpm_mode_bit + MAX(i, 3); + bits = not_planar_mode_flag + mpm_mode_bit + MAX(i, 4); } else { bits = not_mpm_mode_bit + 5 + (modes[mode_i] - smaller_than_pred > 3); } + bits += not_mrl + not_mip; costs[mode_i] += state->lambda_sqrt * bits; modes_out[mode_i].cost = costs[mode_i]; modes_out[mode_i].pred_cu = *pred_cu; @@ -742,17 +746,18 @@ static int16_t search_intra_rough( } -static void get_rough_cost_for_n_modes( +static void 
get_rough_cost_for_2n_modes( encoder_state_t* const state, kvz_intra_references* refs, const cu_loc_t* const cu_loc, kvz_pixel *orig, int orig_stride, intra_search_data_t *search_data, - int num_modes) + int num_modes, + uint8_t mip_ctx) { #define PARALLEL_BLKS 2 - assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_n_modes"); + assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes"); const int width = cu_loc->width; cost_pixel_nxn_multi_func* satd_dual_func = kvz_pixels_get_satd_dual_func(width); cost_pixel_nxn_multi_func* sad_dual_func = kvz_pixels_get_sad_dual_func(width); @@ -765,14 +770,37 @@ static void get_rough_cost_for_n_modes( kvz_pixels_blit(orig, orig_block, width, width, orig_stride, width); + const double mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0; + const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; + const double mip = state->encoder_control->cfg.mip ? 
CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 1) : 0; double costs_out[PARALLEL_BLKS] = { 0 }; + double bits[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { kvz_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); } get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + + for(int i = 0; i < PARALLEL_BLKS; ++i) { + uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx; + if(multi_ref_idx) { + bits[i] = mrl + not_mip; + bits[i] += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[1]), multi_ref_idx != 1); + bits[i] += MIN((mode + i + 1) % 6, 4); + } + else if(search_data[mode + i].pred_cu.intra.mip_flag) { + bits[i] = mip + 1; + bits[i] += num_modes == 32 ? 4 : (num_modes == 16 ? 3 : (((mode + i) % 6) < 2 ? 2 : 3)); + } + else { + assert(0 && "get_rough_cost_for_2n_modes supports only mrl and mip mode cost calculation"); + } + } search_data[mode].cost = costs_out[0]; search_data[mode + 1].cost = costs_out[1]; + + search_data[mode].cost += bits[0] * state->lambda_sqrt; + search_data[mode + 1].cost += bits[1] * state->lambda_sqrt; } #undef PARALLEL_BLKS } @@ -1074,6 +1102,10 @@ void kvz_search_cu_intra( int width = LCU_WIDTH >> depth; int height = width; // TODO: proper height for non-square blocks. + // This is needed for bit cost calculation and requires too many parameters to be + // calculated inside the rough search functions + uint8_t mip_ctx = kvz_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL); + // Find best intra mode for 2Nx2N. 
kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; @@ -1091,7 +1123,8 @@ void kvz_search_cu_intra( LCU_WIDTH, refs, log2_width, candidate_modes, - search_data, &temp_pred_cu); + search_data, &temp_pred_cu, + mip_ctx); } else { for (int8_t i = 0; i < KVZ_NUM_INTRA_MODES; i++) { @@ -1122,8 +1155,10 @@ void kvz_search_cu_intra( } } if(!skip_rough_search) { - get_rough_cost_for_n_modes(state, refs, &cu_loc, - ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes); + get_rough_cost_for_2n_modes(state, refs, &cu_loc, + ref_pixels, + LCU_WIDTH, search_data + number_of_modes, num_mip_modes, + mip_ctx); } } number_of_modes += num_mip_modes; @@ -1162,8 +1197,10 @@ void kvz_search_cu_intra( } } if (!skip_rough_search && lines != 1) { - get_rough_cost_for_n_modes(state, refs, &cu_loc, - ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes); + get_rough_cost_for_2n_modes(state, refs, &cu_loc, + ref_pixels, + LCU_WIDTH, search_data + number_of_modes, num_mrl_modes, + mip_ctx); } number_of_modes += num_mrl_modes; From d41103385a15b0bde4627b82786380a4d9ef8a68 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 15 Apr 2022 12:39:18 +0300 Subject: [PATCH 132/135] fix cclm --- src/search_intra.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index 1dca3bf6..1aa0f361 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -951,13 +951,11 @@ int8_t kvz_search_intra_chroma_rdo( for (int8_t i = 0; i < num_modes; ++i) { const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma; - if(mode < 67 || depth == 0) { - kvz_intra_recon_cu(state, - x_px, y_px, - depth, &chroma_data[i], - &chroma_data[i].pred_cu, - lcu); - } + kvz_intra_recon_cu(state, + x_px, y_px, + depth, &chroma_data[i], + &chroma_data[i].pred_cu, + lcu); if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { chroma_data[i].cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, 
depth, &chroma_data[i].pred_cu, lcu); From b413aa5c438022c93ca9d8445ce99f407d31f648 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 20 Apr 2022 08:12:42 +0300 Subject: [PATCH 133/135] Improve jccr search --- src/encode_coding_tree.c | 4 +- src/encoder_state-bitstream.c | 2 +- src/encoderstate.c | 33 ++++++++++++ src/encoderstate.h | 1 + src/search.c | 25 +++++++--- src/search_intra.c | 1 + src/strategies/generic/quant-generic.c | 69 +++++++++++++------------- 7 files changed, 90 insertions(+), 45 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index e6f39926..4884e3ba 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -507,8 +507,8 @@ static void encode_transform_coeff(encoder_state_t * const state, const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); - const int cb_flag_u = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); - const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int cb_flag_u = cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cur_pu->joint_cb_cr ? 
cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V); // The split_transform_flag is not signaled when: // - transform size is greater than 32 (depth == 0) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 2f24894e..3c0b6b15 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -1125,7 +1125,7 @@ static void kvz_encoder_state_write_bitstream_picture_header( } if (encoder->cfg.jccr) { - WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag"); + WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER diff --git a/src/encoderstate.c b/src/encoderstate.c index db5b93f3..05be79ea 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -634,6 +634,38 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las } } + +static void set_joint_cb_cr_modes(encoder_state_t* state, kvz_picture* pic) +{ + bool sgnFlag = true; + + if (state->encoder_control->chroma_format != KVZ_CSP_400) + { + const int x1 = pic->width / 2 - 1; + const int y1 = pic->height / 2 - 1; + const int cbs = pic->stride / 2; + const int crs = pic->stride / 2; + const kvz_pixel* p_cb = pic->u + 1 * cbs; + const kvz_pixel* p_cr = pic->v + 1 * crs; + int64_t sum_cb_cr = 0; + + // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes + for (int y = 1; y < y1; y++, p_cb += cbs, p_cr += crs) + { + for (int x = 1; x < x1; x++) + { + int cb = (12 * (int)p_cb[x] - 2 * ((int)p_cb[x - 1] + (int)p_cb[x + 1] + (int)p_cb[x - cbs] + (int)p_cb[x + cbs]) - ((int)p_cb[x - 1 - cbs] + (int)p_cb[x + 1 - cbs] + (int)p_cb[x - 1 + cbs] + (int)p_cb[x + 1 + cbs])); + int cr = (12 * (int)p_cr[x] - 2 * ((int)p_cr[x - 1] + (int)p_cr[x + 1] + (int)p_cr[x - crs] + (int)p_cr[x + crs]) - ((int)p_cr[x - 1 - crs] + (int)p_cr[x + 1 - crs] + (int)p_cr[x - 1 + crs] + (int)p_cr[x + 1 + crs])); + sum_cb_cr += cb * cr; + } + } + + sgnFlag = (sum_cb_cr < 
0); + } + + state->frame->jccr_sign = sgnFlag; +} + static void encoder_state_worker_encode_lcu_bitstream(void* opaque); static void encoder_state_worker_encode_lcu_search(void * opaque) @@ -1870,6 +1902,7 @@ void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) encoder_state_init_new_frame(state, frame); + if(state->encoder_control->cfg.jccr) set_joint_cb_cr_modes(state, frame); // Create a separate job for ALF done after everything else, and only then do final bitstream writing (for ALF parameters) if (state->encoder_control->cfg.alf_type && state->encoder_control->cfg.wpp) { diff --git a/src/encoderstate.h b/src/encoderstate.h index 19c0d196..edfc6a38 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -195,6 +195,7 @@ typedef struct encoder_state_config_frame_t { cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row uint8_t* hmvp_size; //!< \brief HMVP LUT size + bool jccr_sign; } encoder_state_config_frame_t; diff --git a/src/search.c b/src/search.c index 3686da07..a474d4c5 100644 --- a/src/search.c +++ b/src/search.c @@ -637,16 +637,17 @@ void kvz_select_jccr_mode( int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search"); - int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1; + int cbf_mask = u_is_set * 2 + v_is_set - 1; if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2) CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag"); if(pred_cu->joint_cb_cr) { + const int u_jccr = (pred_cu->joint_cb_cr >> 1) & 1; ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cb_search"); - ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); - CABAC_FBITS_UPDATE(cabac, ctx, (pred_cu->joint_cb_cr & 2) >> 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); - cbf_mask = 
(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1; + CABAC_FBITS_UPDATE(cabac, ctx, u_jccr, joint_cbcr_tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_jccr]); + CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); + cbf_mask = pred_cu->joint_cb_cr - 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag"); } int ssd = 0; @@ -695,10 +696,10 @@ void kvz_select_jccr_mode( } cbf_clear(&pred_cu->cbf, depth, COLOR_U); cbf_clear(&pred_cu->cbf, depth, COLOR_V); - if (pred_cu->joint_cb_cr & 1) { + if (pred_cu->joint_cb_cr & 2) { cbf_set(&pred_cu->cbf, depth, COLOR_U); } - if (pred_cu->joint_cb_cr & 2) { + if (pred_cu->joint_cb_cr & 1) { cbf_set(&pred_cu->cbf, depth, COLOR_V); } int lcu_width = LCU_WIDTH_C; @@ -989,6 +990,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); + cur_cu->joint_cb_cr = 0; // TODO: This heavily relies to square CUs if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) { @@ -996,7 +998,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. 
- cur_cu->joint_cb_cr = 0; intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search); @@ -1022,6 +1023,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) { assert(cur_cu->joint_cb_cr < 4); + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); + if (cur_cu->joint_cb_cr & 2) { + cbf_set(&cur_cu->cbf, depth, COLOR_U); + } + if (cur_cu->joint_cb_cr & 1) { + cbf_set(&cur_cu->cbf, depth, COLOR_V); + } const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 }; int lcu_width = LCU_WIDTH_C; const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/search_intra.c b/src/search_intra.c index 1aa0f361..7a8eb41b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -320,6 +320,7 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; + pred_cu->joint_cb_cr = 4; for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; if (mts_enabled) diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 41ce1d58..5601106f 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -225,39 +225,40 @@ int kvz_quant_cbcr_residual_generic( int64_t best_cost = INT64_MAX; // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM - for(int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) { + for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) { int64_t d1 = 0; + const int cbf_mask = i * (state->frame->jccr_sign ? 
-1 : 1); for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { int cbx = u_residual[x + y * width], crx = v_residual[x + y * width]; - if (cbf_mask == 1) + if (cbf_mask == 2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1)); } - else if (cbf_mask == -1) + else if (cbf_mask == -2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1)); } else if (cbf_mask == 3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]); } else if (cbf_mask == -3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]); } - else if (cbf_mask == 2) + else if (cbf_mask == 1) { v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5); d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); } - else if (cbf_mask == 
-2) + else if (cbf_mask == -1) { v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5); d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); @@ -270,19 +271,19 @@ int kvz_quant_cbcr_residual_generic( } } if (d1 < best_cost) { - best_cbf_mask = cbf_mask; + best_cbf_mask = i; best_cost = d1; } } - kvz_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + kvz_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - kvz_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, tr_depth, cur_cu->cbf); } else if (state->encoder_control->cfg.rdoq_enable && false) { @@ -290,7 +291,7 @@ int kvz_quant_cbcr_residual_generic( scan_order); } else { - kvz_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); } @@ -309,10 +310,10 @@ int kvz_quant_cbcr_residual_generic( int y, x; // Get quantized residual. (coeff_out -> coeff -> residual) - kvz_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - kvz_itransform2d(state->encoder_control, best_cbf_mask == 2 ? 
v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + kvz_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -333,32 +334,32 @@ int kvz_quant_cbcr_residual_generic( // } // } //} - + const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. (residual + pred_in -> rec_out) for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { - if (best_cbf_mask == 1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + if (temp == 2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == -1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + else if (temp == -2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == 3) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; + else if (temp == 3) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; } - else if (best_cbf_mask == -3) { + else if (temp == -3) { // non-normative clipping to prevent 16-bit overflow - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 
32767 : -v1_residual[best_cbf_mask][x]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width]; + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width]; } - else if (best_cbf_mask == 2) { + else if (temp == 1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = v1_residual[x + y * width]; } - else if (best_cbf_mask == -2) { + else if (temp == -1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = -v1_residual[x + y * width]; } From 0381d7d7793edb4485b4d7362a2817036f3907b9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 21 Apr 2022 09:46:54 +0300 Subject: [PATCH 134/135] Move 4x4 chroma tu info to bottom right cu when intra is split into 4x4 --- src/encode_coding_tree.c | 14 ++++++-------- src/intra.c | 5 +++++ src/search.c | 22 +++++++++++----------- src/search_intra.c | 2 +- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 4884e3ba..e0459239 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -352,8 +352,8 @@ void kvz_encode_last_significant_xy(cabac_data_t * const cabac, } static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, uint8_t joint_chroma) { - int x_local = (x >> 1) % LCU_WIDTH_C; - int y_local = (y >> 1) % LCU_WIDTH_C; + int x_local = ((x & ~7) >> 1) % LCU_WIDTH_C; + int y_local = ((y & ~7) >> 1) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; *scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); if(!joint_chroma){ @@ -367,7 +367,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep // TODO: transform skip 
for chroma blocks CABAC_BIN(cabac, 0, "transform_skip_flag"); } - kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false); + kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, NULL, false); } if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { @@ -375,7 +375,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false); + kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, NULL, false); } } else { @@ -384,7 +384,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - kvz_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, 2, *scan_idx, NULL, false); + kvz_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, NULL, false); } } @@ -444,8 +444,6 @@ static void encode_transform_unit(encoder_state_t * const state, } else { // Time to to code the chroma transform blocks. Move to the top-left // corner of the block. - x -= 4; - y -= 4; cur_pu = kvz_cu_array_at_const((const cu_array_t *)frame->cu_array, x, y); } } @@ -485,7 +483,7 @@ static void encode_transform_coeff(encoder_state_t * const state, // containing CU. const int x_cu = 8 * (x / 8); const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x_cu, y_cu); + const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x, y); // NxN signifies implicit transform split at the first transform level. 
// There is a similar implicit split for inter, but it is only used when diff --git a/src/intra.c b/src/intra.c index 88849c4e..c1b0b095 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1534,6 +1534,11 @@ void kvz_intra_recon_cu( } const int8_t mode_luma = search_data->pred_cu.intra.mode; const int8_t mode_chroma= search_data->pred_cu.intra.mode_chroma; + + if(mode_chroma != -1 && mode_luma == -1) { + x &= ~7; + y &= ~7; + } if (mode_luma != -1 && mode_chroma != -1) { if (search_data->pred_cu.intra.mip_flag) { diff --git a/src/search.c b/src/search.c index a474d4c5..8d93390a 100644 --- a/src/search.c +++ b/src/search.c @@ -608,7 +608,7 @@ void kvz_select_jccr_mode( { const vector2d_t lcu_px = { (SUB_SCU(x_px) & ~7) / 2, (SUB_SCU(y_px) & ~7) / 2 }; const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x * 2, lcu_px.y * 2); + if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x_px), SUB_SCU(y_px)); assert(pred_cu->depth == pred_cu->tr_depth && "jccr does not support transform splitting"); if (cost_out == NULL && pred_cu->joint_cb_cr == 0) { return; @@ -650,23 +650,23 @@ void kvz_select_jccr_mode( cbf_mask = pred_cu->joint_cb_cr - 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag"); } - int ssd = 0; - int joint_ssd = 0; + unsigned ssd = 0; + unsigned joint_ssd = 0; if (!state->encoder_control->cfg.lossless) { - int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; - int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + const int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + const unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, width); - int ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + const unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, width); ssd = ssd_u + ssd_v; if 
(pred_cu->joint_cb_cr) { - int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], + const unsigned ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], LCU_WIDTH_C, LCU_WIDTH_C, width); - int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], + const unsigned ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], LCU_WIDTH_C, LCU_WIDTH_C, width); joint_ssd = ssd_u_joint + ssd_v_joint; @@ -1009,13 +1009,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } intra_search.pred_cu.intra.mode = -1; // skip luma kvz_intra_recon_cu(state, - x & ~7, y & ~7, // TODO: as does this + x, y, // TODO: as does this depth, &intra_search, NULL, lcu); if(depth != 0 && state->encoder_control->cfg.jccr && ctrl->cfg.rdo < 3) { kvz_select_jccr_mode(state, - x & ~7, y & ~7, + x, y, depth, NULL, lcu, @@ -1074,7 +1074,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, false); if (cur_cu->depth == cur_cu->tr_depth && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr) { kvz_select_jccr_mode(state, - x & ~7, y & ~7, + x, y, depth, NULL, lcu, diff --git a/src/search_intra.c b/src/search_intra.c index 7a8eb41b..7c9ea40c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -367,7 +367,7 @@ static double search_intra_trdepth( pred_cu->intra.mode_chroma = chroma_mode; pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently kvz_intra_recon_cu(state, - x_px & ~7, y_px & ~7, + x_px, y_px, depth, search_data, pred_cu, lcu); From ede7603361377404c06ef0567be23a09f438ff8e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 21 Apr 2022 11:17:44 +0300 Subject: [PATCH 135/135] Fix chroma bit cost calculation --- src/search_intra.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/search_intra.c 
b/src/search_intra.c index 7c9ea40c..f4ff2351 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -930,7 +930,8 @@ int8_t kvz_search_intra_chroma_rdo( int depth, int8_t num_modes, lcu_t *const lcu, - intra_search_data_t* chroma_data) + intra_search_data_t* chroma_data, + int8_t luma_mode) { const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); @@ -964,7 +965,7 @@ int8_t kvz_search_intra_chroma_rdo( kvz_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost); } - double mode_bits = kvz_chroma_mode_bits(state, mode, chroma_data[i].pred_cu.intra.mode); + double mode_bits = kvz_chroma_mode_bits(state, mode, luma_mode); chroma_data[i].cost += mode_bits * state->lambda; } sort_modes(chroma_data, num_modes); @@ -1045,7 +1046,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state, int8_t intra_mode_chroma = intra_mode; if (num_modes > 1) { - intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data); + intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode); } *search_data = chroma_data[0]; return intra_mode_chroma;