diff --git a/README.md b/README.md index 8abfb203..49a0af84 100644 --- a/README.md +++ b/README.md @@ -207,6 +207,11 @@ Compression tools: when QP is below the limit. [0] --(no-)intra-rdo-et : Check intra modes in rdo stage only until a zero coefficient CU is found. [disabled] + --(no-)early-skip : Try to find skip cu from merge candidates. + Perform no further search if skip is found. + For rd=0..1: Try the first candidate. + For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported with lossless coding. [disabled] --(no-)tmvp : Temporal motion vector prediction [enabled] diff --git a/configure.ac b/configure.ac index 61ffd4e5..022a455c 100644 --- a/configure.ac +++ b/configure.ac @@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=4 -ver_minor=1 +ver_minor=2 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/doc/kvazaar.1 b/doc/kvazaar.1 index 62d66a6c..773d91c9 100644 --- a/doc/kvazaar.1 +++ b/doc/kvazaar.1 @@ -1,4 +1,4 @@ -.TH KVAZAAR "1" "May 2019" "kvazaar v1.2.0" "User Commands" +.TH KVAZAAR "1" "July 2019" "kvazaar v1.2.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS @@ -278,6 +278,13 @@ Skip CABAC cost for residual coefficients Check intra modes in rdo stage only until a zero coefficient CU is found. [disabled] .TP +\fB\-\-(no\-)early\-skip +Try to find skip cu from merge candidates. +Perform no further search if skip is found. +For rd=0..1: Try the first candidate. +For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] +.TP \fB\-\-(no\-)implicit\-rdpcm Implicit residual DPCM. Currently only supported with lossless coding. [disabled] diff --git a/src/cfg.c b/src/cfg.c index 6cd9a348..08dc25a1 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -139,6 +139,7 @@ int kvz_config_init(kvz_config *cfg) cfg->scaling_list = KVZ_SCALING_LIST_OFF; cfg->max_merge = 5; + cfg->early_skip = true; return 1; } @@ -385,7 +386,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) static const char * const scaling_list_names[] = { "off", "custom", "default", NULL }; - static const char * const preset_values[11][23*2] = { + static const char * const preset_values[11][24*2] = { { "ultrafast", "rd", "0", @@ -409,6 +410,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "sensitive", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "28", NULL }, @@ -435,6 +437,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "sensitive", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "28", NULL }, @@ -461,6 +464,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "sensitive", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "28", NULL }, @@ -487,6 +491,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "sensitive", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -513,6 +518,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "sensitive", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -539,6 +545,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "on", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -565,6 +572,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "on", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -591,6 +599,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "off", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -617,6 +626,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "zero", "me-early-termination", "off", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -643,6 +653,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) "cu-split-termination", "off", "me-early-termination", "off", "intra-rdo-et", "0", + "early-skip", "1", "fast-residual-cost", "0", NULL }, @@ -1236,6 +1247,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) } cfg->max_merge = (uint8_t)max_merge; } + else if OPT("early-skip") { + cfg->early_skip = (bool)atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 6e6d9c5b..eede568d 100644 --- a/src/cli.c +++ b/src/cli.c @@ -135,6 +135,8 @@ static const struct option long_options[] = { { "no-open-gop", no_argument, NULL, 0 }, { "scaling-list", required_argument, NULL, 0 }, { "max-merge", required_argument, NULL, 0 }, + { "early-skip", no_argument, NULL, 0 }, + { "no-early-skip", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -489,6 +491,11 @@ void print_help(void) " when QP is below the limit. [0]\n" " --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n" " a zero coefficient CU is found. [disabled]\n" + " --(no-)early-skip : Try to find skip cu from merge candidates.\n" + " Perform no further search if skip is found.\n" + " For rd=0..1: Try the first candidate.\n" + " For rd=2.. : Try the best candidate based\n" + " on luma satd cost. [enabled]\n" " --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported\n" " with lossless coding. [disabled]\n" " --(no-)tmvp : Temporal motion vector prediction [enabled]\n" diff --git a/src/kvazaar.h b/src/kvazaar.h index ff065d85..96838621 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -387,6 +387,9 @@ typedef struct kvz_config /** \brief Maximum number of merge cadidates */ uint8_t max_merge; + /** \brief Enable Early Skip Mode Decision */ + uint8_t early_skip; + } kvz_config; /** diff --git a/src/search.c b/src/search.c index 3d62f230..4e2051ce 100644 --- a/src/search.c +++ b/src/search.c @@ -403,6 +403,30 @@ static double calc_mode_bits(const encoder_state_t *state, } +/** + * \brief Sort modes and costs to ascending order according to costs. + */ +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. + for (uint8_t i = 1; i < length; ++i) { + const double cur_cost = costs[i]; + const int8_t cur_mode = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost < costs[j - 1]) { + costs[j] = costs[j - 1]; + modes[j] = modes[j - 1]; + --j; + } + costs[j] = cur_cost; + modes[j] = cur_mode; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; @@ -482,29 +506,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->type = CU_INTER; } - // Try SMP and AMP partitioning. - static const part_mode_t mp_modes[] = { - // SMP - SIZE_2NxN, SIZE_Nx2N, - // AMP - SIZE_2NxnU, SIZE_2NxnD, - SIZE_nLx2N, SIZE_nRx2N, - }; + if (!cur_cu->skipped) { + // Try SMP and AMP partitioning. + static const part_mode_t mp_modes[] = { + // SMP + SIZE_2NxN, SIZE_Nx2N, + // AMP + SIZE_2NxnU, SIZE_2NxnD, + SIZE_nLx2N, SIZE_nRx2N, + }; - const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; - const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; - for (int i = first_mode; i <= last_mode; ++i) { - kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); - if (mode_cost < cost) { - cost = mode_cost; - inter_bitcost = mode_bitcost; - // Copy inter prediction info to current level. - copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; + const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; + for (int i = first_mode; i <= last_mode; ++i) { + kvz_search_cu_smp(state, + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + // Copy inter prediction info to current level. + copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + } } } } @@ -512,9 +538,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // Try to skip intra search in rd==0 mode. // This can be quite severe on bdrate. It might be better to do this // decision after reconstructing the inter frame. - bool skip_intra = state->encoder_control->cfg.rdo == 0 + bool skip_intra = (state->encoder_control->cfg.rdo == 0 && cur_cu->type != CU_NOTSET - && cost / (cu_width * cu_width) < INTRA_THRESHOLD; + && cost / (cu_width * cu_width) < INTRA_THRESHOLD) + || cur_cu->skipped; int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max; bool can_use_intra = @@ -567,43 +594,47 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, NULL, lcu); } } else if (cur_cu->type == CU_INTER) { - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - kvz_inter_recon_cu(state, lcu, x, y, cu_width); + if (!cur_cu->skipped) { + // Reset transform depth because intra messes with them. + // This will no longer be necessary if the transform depths are not shared. + int tr_depth = MAX(1, depth); + if (cur_cu->part_size != SIZE_2Nx2N) { + tr_depth = depth + 1; + } + kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { - //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; + kvz_inter_recon_cu(state, lcu, x, y, cu_width); - } + if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + //Calculate cost for zero coeffs + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - kvz_quantize_lcu_residual(state, - true, has_chroma, - x, y, depth, - NULL, - lcu); + } - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_quantize_lcu_residual(state, + true, has_chroma, + x, y, depth, + NULL, + lcu); - if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + // Selecting skip reduces bits needed to code the CU + if (inter_bitcost > 1) { + inter_bitcost -= 1; + } } } lcu_set_inter(lcu, x_local, y_local, cu_width); lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu); } } + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); if (state->encoder_control->chroma_format != KVZ_CSP_400) { diff --git a/src/search.h b/src/search.h index cdaa63be..8a51ba42 100644 --- a/src/search.h +++ b/src/search.h @@ -31,6 +31,7 @@ #include "global.h" // IWYU pragma: keep #include "image.h" +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf); diff --git a/src/search_inter.c b/src/search_inter.c index 35cc6a93..2762e37b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1510,6 +1510,90 @@ static void search_pu_inter(encoder_state_t * const state, CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); + // Early Skip Mode Decision + if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + + int num_rdo_cands = 0; + int8_t mrg_cands[MRG_MAX_NUM_CANDS] = { 0, 1, 2, 3, 4 }; + double mrg_costs[MRG_MAX_NUM_CANDS] = { MAX_DOUBLE }; + + // Check motion vector constraints and perform rough search + for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + + // Don't try merge candidates that don't satisfy mv constraints. + if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1])) + { + continue; + } + + if (cfg->rdo >= 2) { + + kvz_lcu_set_trdepth(lcu, x, y, depth, depth); + kvz_inter_recon_cu(state, lcu, x, y, width); + mrg_costs[merge_idx] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + } + + num_rdo_cands++; + } + + + if (cfg->rdo >= 2) { + // Sort candidates by cost + kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + } + + // Limit by availability + // TODO: Do not limit to just 1 + num_rdo_cands = MIN(1, num_rdo_cands); + + // RDO search + for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { + + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. + int merge_idx = mrg_cands[merge_rdo_idx]; + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + kvz_lcu_set_trdepth(lcu, x, y, depth, depth); + kvz_inter_recon_cu(state, lcu, x, y, width); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu); + + if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { + continue; + } + else if(state->encoder_control->chroma_format != KVZ_CSP_400) { + + kvz_quantize_lcu_residual(state, false, true, x, y, depth, cur_cu, lcu); + if (!cbf_is_set_any(cur_cu->cbf, depth)) { + cur_cu->type = CU_INTER; + cur_cu->merge_idx = merge_idx; + cur_cu->skipped = true; + *inter_cost = 0.0; // TODO: Check this + *inter_bitcost = 0; // TODO: Check this + return; + } + } + } + } + for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; diff --git a/src/search_intra.c b/src/search_intra.c index 9c31d95c..eff408cd 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -41,29 +41,6 @@ #endif -/** - * \brief Sort modes and costs to ascending order according to costs. - */ -static INLINE void sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) -{ - // Length is always between 5 and 23, and is either 21, 17, 9 or 8 about - // 60% of the time, so there should be no need for anything more complex - // than insertion sort. - for (uint8_t i = 1; i < length; ++i) { - const double cur_cost = costs[i]; - const int8_t cur_mode = modes[i]; - uint8_t j = i; - while (j > 0 && cur_cost < costs[j - 1]) { - costs[j] = costs[j - 1]; - modes[j] = modes[j - 1]; - --j; - } - costs[j] = cur_cost; - modes[j] = cur_mode; - } -} - - /** * \brief Select mode with the smallest cost. */ @@ -367,7 +344,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state, costs[i] += satd_func(pred, orig_block); } - sort_modes(modes, costs, 5); + kvz_sort_modes(modes, costs, 5); } @@ -630,7 +607,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } // Update order according to new costs - sort_modes(modes, costs, modes_to_check); + kvz_sort_modes(modes, costs, modes_to_check); // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the @@ -868,7 +845,7 @@ void kvz_search_cu_intra(encoder_state_t * const state, } int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search); - sort_modes(modes, costs, number_of_modes); + kvz_sort_modes(modes, costs, number_of_modes); number_of_modes = search_intra_rdo(state, x_px, y_px, depth, ref_pixels, LCU_WIDTH,