Early skip

This commit is contained in:
Ari Lemmetti 2019-07-06 21:34:29 +03:00
parent 5bfe585e74
commit 4097331fd6
10 changed files with 207 additions and 78 deletions

View file

@ -207,6 +207,11 @@ Compression tools:
when QP is below the limit. [0]
--(no-)intra-rdo-et : Check intra modes in rdo stage only until
a zero coefficient CU is found. [disabled]
--(no-)early-skip : Try to find skip cu from merge candidates.
Perform no further search if skip is found.
For rd=0..1: Try the first candidate.
For rd=2.. : Try the best candidate based
on luma satd cost. [enabled]
--(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported
with lossless coding. [disabled]
--(no-)tmvp : Temporal motion vector prediction [enabled]

View file

@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
#
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
ver_major=4
ver_minor=1
ver_minor=2
ver_release=0
# Prevents configure from adding a lot of defines to the CFLAGS

View file

@ -1,4 +1,4 @@
.TH KVAZAAR "1" "May 2019" "kvazaar v1.2.0" "User Commands"
.TH KVAZAAR "1" "July 2019" "kvazaar v1.2.0" "User Commands"
.SH NAME
kvazaar \- open source HEVC encoder
.SH SYNOPSIS
@ -278,6 +278,13 @@ Skip CABAC cost for residual coefficients
Check intra modes in rdo stage only until
a zero coefficient CU is found. [disabled]
.TP
\fB\-\-(no\-)early\-skip
Try to find skip cu from merge candidates.
Perform no further search if skip is found.
For rd=0..1: Try the first candidate.
For rd=2.. : Try the best candidate based
on luma satd cost. [enabled]
.TP
\fB\-\-(no\-)implicit\-rdpcm
Implicit residual DPCM. Currently only supported
with lossless coding. [disabled]

View file

@ -139,6 +139,7 @@ int kvz_config_init(kvz_config *cfg)
cfg->scaling_list = KVZ_SCALING_LIST_OFF;
cfg->max_merge = 5;
cfg->early_skip = true;
return 1;
}
@ -385,7 +386,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
static const char * const scaling_list_names[] = { "off", "custom", "default", NULL };
static const char * const preset_values[11][23*2] = {
static const char * const preset_values[11][24*2] = {
{
"ultrafast",
"rd", "0",
@ -409,6 +410,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "28",
NULL
},
@ -435,6 +437,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "28",
NULL
},
@ -461,6 +464,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "28",
NULL
},
@ -487,6 +491,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -513,6 +518,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -539,6 +545,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "on",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -565,6 +572,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "on",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -591,6 +599,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "off",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -617,6 +626,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "zero",
"me-early-termination", "off",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -643,6 +653,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
"cu-split-termination", "off",
"me-early-termination", "off",
"intra-rdo-et", "0",
"early-skip", "1",
"fast-residual-cost", "0",
NULL
},
@ -1236,6 +1247,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
}
cfg->max_merge = (uint8_t)max_merge;
}
else if OPT("early-skip") {
cfg->early_skip = (bool)atobool(value);
}
else {
return 0;
}

View file

@ -135,6 +135,8 @@ static const struct option long_options[] = {
{ "no-open-gop", no_argument, NULL, 0 },
{ "scaling-list", required_argument, NULL, 0 },
{ "max-merge", required_argument, NULL, 0 },
{ "early-skip", no_argument, NULL, 0 },
{ "no-early-skip", no_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -489,6 +491,11 @@ void print_help(void)
" when QP is below the limit. [0]\n"
" --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n"
" a zero coefficient CU is found. [disabled]\n"
" --(no-)early-skip : Try to find skip cu from merge candidates.\n"
" Perform no further search if skip is found.\n"
" For rd=0..1: Try the first candidate.\n"
" For rd=2.. : Try the best candidate based\n"
" on luma satd cost. [enabled]\n"
" --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported\n"
" with lossless coding. [disabled]\n"
" --(no-)tmvp : Temporal motion vector prediction [enabled]\n"

View file

@ -387,6 +387,9 @@ typedef struct kvz_config
/** \brief Maximum number of merge candidates */
uint8_t max_merge;
/** \brief Enable Early Skip Mode Decision */
uint8_t early_skip;
} kvz_config;
/**

View file

@ -403,6 +403,30 @@ static double calc_mode_bits(const encoder_state_t *state,
}
/**
 * \brief Sort modes and costs to ascending order according to costs.
 *
 * The two arrays are permuted in lockstep, so modes[k] keeps belonging to
 * costs[k] after the call. Entries with equal cost keep their relative order.
 *
 * \param modes   Mode identifiers, reordered in place.
 * \param costs   Cost of each mode, sorted in place.
 * \param length  Number of leading entries of both arrays to sort.
 */
void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length)
{
  // Plain insertion sort. Length for intra is always between 5 and 23
  // (21, 17, 9 or 8 about 60% of the time) and 5 or less for merge, so
  // nothing more complex is needed.
  for (uint8_t next = 1; next < length; ++next) {
    const int8_t moved_mode = modes[next];
    const double moved_cost = costs[next];

    // Shift every strictly more expensive predecessor one slot to the right.
    uint8_t slot = next;
    for (; slot > 0; --slot) {
      if (!(moved_cost < costs[slot - 1])) {
        break;
      }
      modes[slot] = modes[slot - 1];
      costs[slot] = costs[slot - 1];
    }

    modes[slot] = moved_mode;
    costs[slot] = moved_cost;
  }
}
static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
{
vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) };
@ -482,29 +506,31 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
cur_cu->type = CU_INTER;
}
// Try SMP and AMP partitioning.
static const part_mode_t mp_modes[] = {
// SMP
SIZE_2NxN, SIZE_Nx2N,
// AMP
SIZE_2NxnU, SIZE_2NxnD,
SIZE_nLx2N, SIZE_nRx2N,
};
if (!cur_cu->skipped) {
// Try SMP and AMP partitioning.
static const part_mode_t mp_modes[] = {
// SMP
SIZE_2NxN, SIZE_Nx2N,
// AMP
SIZE_2NxnU, SIZE_2NxnD,
SIZE_nLx2N, SIZE_nRx2N,
};
const int first_mode = ctrl->cfg.smp_enable ? 0 : 2;
const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1;
for (int i = first_mode; i <= last_mode; ++i) {
kvz_search_cu_smp(state,
x, y,
depth,
mp_modes[i],
&work_tree[depth + 1],
&mode_cost, &mode_bitcost);
if (mode_cost < cost) {
cost = mode_cost;
inter_bitcost = mode_bitcost;
// Copy inter prediction info to current level.
copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
const int first_mode = ctrl->cfg.smp_enable ? 0 : 2;
const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1;
for (int i = first_mode; i <= last_mode; ++i) {
kvz_search_cu_smp(state,
x, y,
depth,
mp_modes[i],
&work_tree[depth + 1],
&mode_cost, &mode_bitcost);
if (mode_cost < cost) {
cost = mode_cost;
inter_bitcost = mode_bitcost;
// Copy inter prediction info to current level.
copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu);
}
}
}
}
@ -512,9 +538,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
// Try to skip intra search in rd==0 mode.
// This can be quite severe on bdrate. It might be better to do this
// decision after reconstructing the inter frame.
bool skip_intra = state->encoder_control->cfg.rdo == 0
bool skip_intra = (state->encoder_control->cfg.rdo == 0
&& cur_cu->type != CU_NOTSET
&& cost / (cu_width * cu_width) < INTRA_THRESHOLD;
&& cost / (cu_width * cu_width) < INTRA_THRESHOLD)
|| cur_cu->skipped;
int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max;
bool can_use_intra =
@ -567,43 +594,47 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
NULL, lcu);
}
} else if (cur_cu->type == CU_INTER) {
// Reset transform depth because intra messes with them.
// This will no longer be necessary if the transform depths are not shared.
int tr_depth = MAX(1, depth);
if (cur_cu->part_size != SIZE_2Nx2N) {
tr_depth = depth + 1;
}
kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth);
kvz_inter_recon_cu(state, lcu, x, y, cu_width);
if (!cur_cu->skipped) {
// Reset transform depth because intra messes with them.
// This will no longer be necessary if the transform depths are not shared.
int tr_depth = MAX(1, depth);
if (cur_cu->part_size != SIZE_2Nx2N) {
tr_depth = depth + 1;
}
kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth);
if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
//Calculate cost for zero coeffs
inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda;
kvz_inter_recon_cu(state, lcu, x, y, cu_width);
}
if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
//Calculate cost for zero coeffs
inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda;
const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
kvz_quantize_lcu_residual(state,
true, has_chroma,
x, y, depth,
NULL,
lcu);
}
int cbf = cbf_is_set_any(cur_cu->cbf, depth);
const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
kvz_quantize_lcu_residual(state,
true, has_chroma,
x, y, depth,
NULL,
lcu);
if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
cur_cu->merged = 0;
cur_cu->skipped = 1;
// Selecting skip reduces bits needed to code the CU
if (inter_bitcost > 1) {
inter_bitcost -= 1;
int cbf = cbf_is_set_any(cur_cu->cbf, depth);
if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) {
cur_cu->merged = 0;
cur_cu->skipped = 1;
// Selecting skip reduces bits needed to code the CU
if (inter_bitcost > 1) {
inter_bitcost -= 1;
}
}
}
lcu_set_inter(lcu, x_local, y_local, cu_width);
lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
}
}
if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
if (state->encoder_control->chroma_format != KVZ_CSP_400) {

View file

@ -31,6 +31,7 @@
#include "global.h" // IWYU pragma: keep
#include "image.h"
void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf);

View file

@ -1510,6 +1510,90 @@ static void search_pu_inter(encoder_state_t * const state,
CU_SET_MV_CAND(cur_cu, 0, 0);
CU_SET_MV_CAND(cur_cu, 1, 0);
// Early Skip Mode Decision
if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) {
int num_rdo_cands = 0;
int8_t mrg_cands[MRG_MAX_NUM_CANDS] = { 0, 1, 2, 3, 4 };
double mrg_costs[MRG_MAX_NUM_CANDS] = { MAX_DOUBLE };
// Check motion vector constraints and perform rough search
for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) {
cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir;
cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0];
cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1];
cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0];
cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1];
cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0];
cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1];
// Don't try merge candidates that don't satisfy mv constraints.
if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) ||
!fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]))
{
continue;
}
if (cfg->rdo >= 2) {
kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
kvz_inter_recon_cu(state, lcu, x, y, width);
mrg_costs[merge_idx] = kvz_satd_any_size(width, height,
lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
}
num_rdo_cands++;
}
if (cfg->rdo >= 2) {
// Sort candidates by cost
kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands);
}
// Limit by availability
// TODO: Do not limit to just 1
num_rdo_cands = MIN(1, num_rdo_cands);
// RDO search
for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) {
// Reconstruct blocks with merge candidate.
// Check luma CBF. Then, check chroma CBFs if luma CBF is not set
// and chroma exists.
// Early terminate if merge candidate with zero CBF is found.
int merge_idx = mrg_cands[merge_rdo_idx];
cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir;
cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0];
cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1];
cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0];
cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1];
cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0];
cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1];
kvz_lcu_set_trdepth(lcu, x, y, depth, depth);
kvz_inter_recon_cu(state, lcu, x, y, width);
kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu);
if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) {
continue;
}
else if(state->encoder_control->chroma_format != KVZ_CSP_400) {
kvz_quantize_lcu_residual(state, false, true, x, y, depth, cur_cu, lcu);
if (!cbf_is_set_any(cur_cu->cbf, depth)) {
cur_cu->type = CU_INTER;
cur_cu->merge_idx = merge_idx;
cur_cu->skipped = true;
*inter_cost = 0.0; // TODO: Check this
*inter_bitcost = 0; // TODO: Check this
return;
}
}
}
}
for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
info.ref_idx = ref_idx;
info.ref = state->frame->ref->images[ref_idx];

View file

@ -41,29 +41,6 @@
#endif
/**
 * \brief Sort modes and costs to ascending order according to costs.
 *
 * Both arrays are rearranged together so that modes[k] stays paired with
 * costs[k]. Entries with equal cost keep their relative order.
 *
 * \param modes   Mode identifiers, reordered in place.
 * \param costs   Cost of each mode, sorted in place.
 * \param length  Number of leading entries of both arrays to sort.
 */
static INLINE void sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length)
{
  // Length is always between 5 and 23, and is either 21, 17, 9 or 8 about
  // 60% of the time, so insertion sort is sufficient.
  for (uint8_t idx = 1; idx < length; ++idx) {
    const int8_t key_mode = modes[idx];
    const double key_cost = costs[idx];

    // Locate the insertion point: walk left past strictly larger costs.
    uint8_t target = idx;
    while (target > 0 && key_cost < costs[target - 1]) {
      --target;
    }

    // Open a gap at the insertion point by moving the tail one slot right.
    for (uint8_t k = idx; k > target; --k) {
      modes[k] = modes[k - 1];
      costs[k] = costs[k - 1];
    }

    modes[target] = key_mode;
    costs[target] = key_cost;
  }
}
/**
* \brief Select mode with the smallest cost.
*/
@ -367,7 +344,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
costs[i] += satd_func(pred, orig_block);
}
sort_modes(modes, costs, 5);
kvz_sort_modes(modes, costs, 5);
}
@ -630,7 +607,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
}
// Update order according to new costs
sort_modes(modes, costs, modes_to_check);
kvz_sort_modes(modes, costs, modes_to_check);
// The best transform split hierarchy is not saved anywhere, so to get the
// transform split hierarchy the search has to be performed again with the
@ -868,7 +845,7 @@ void kvz_search_cu_intra(encoder_state_t * const state,
}
int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search);
sort_modes(modes, costs, number_of_modes);
kvz_sort_modes(modes, costs, number_of_modes);
number_of_modes = search_intra_rdo(state,
x_px, y_px, depth,
ref_pixels, LCU_WIDTH,