diff --git a/README.md b/README.md index cd96e124..a481d70d 100644 --- a/README.md +++ b/README.md @@ -150,11 +150,20 @@ Video structure: - frametile: Constrain within the tile. - frametilemargin: Constrain even more. --roi : Use a delta QP map for region of interest. - Reads an array of delta QP values from a text - file. The file format is: width and height of - the QP delta map followed by width*height delta - QP values in raster order. The map can be of any - size and will be scaled to the video size. + Reads an array of delta QP values from a file. + Text and binary files are supported and detected + from the file extension (.txt/.bin). If a known + extension is not found, the file is treated as + a text file. The file can include one or many + ROI frames each in the following format: + width and height of the QP delta map followed + by width * height delta QP values in raster + order. In binary format, width and height are + 32-bit integers whereas the delta QP values are + signed 8-bit values. The map can be of any size + and will be scaled to the video size. The file + reading will loop if end of the file is reached. + See roi.txt in the examples folder. --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. in PPS and slice_qp_delta in slize header zero. --(no-)erp-aqp : Use adaptive QP for 360 degree video with diff --git a/configure.ac b/configure.ac index 3a0d1582..08a35042 100644 --- a/configure.ac +++ b/configure.ac @@ -22,8 +22,8 @@ AC_CONFIG_SRCDIR([src/encmain.c]) # - Increment when making new releases and major or minor was not changed since last release. # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html -ver_major=6 -ver_minor=7 +ver_major=7 +ver_minor=2 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS diff --git a/doc/kvazaar.1 b/doc/kvazaar.1 index f6f7821a..c3f80f6b 100644 --- a/doc/kvazaar.1 +++ b/doc/kvazaar.1 @@ -164,11 +164,20 @@ Constrain movement vectors. 
[none] .TP \fB\-\-roi Use a delta QP map for region of interest. -Reads an array of delta QP values from a text -file. The file format is: width and height of -the QP delta map followed by width*height delta -QP values in raster order. The map can be of any -size and will be scaled to the video size. +Reads an array of delta QP values from a file. +Text and binary files are supported and detected +from the file extension (.txt/.bin). If a known +extension is not found, the file is treated as +a text file. The file can include one or many +ROI frames each in the following format: +width and height of the QP delta map followed +by width * height delta QP values in raster +order. In binary format, width and height are +32\-bit integers whereas the delta QP values are +signed 8\-bit values. The map can be of any size +and will be scaled to the video size. The file +reading will loop if end of the file is reached. +See roi.txt in the examples folder. .TP \fB\-\-set\-qp\-in\-cu Set QP at CU level keeping pic_init_qp_minus26. 
diff --git a/src/bitstream.c b/src/bitstream.c index c524e6e2..3ba866eb 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -33,6 +33,7 @@ #include "bitstream.h" #include +#include #include #include diff --git a/src/cabac.c b/src/cabac.c index 26ff0e34..a35358ae 100644 --- a/src/cabac.c +++ b/src/cabac.c @@ -70,6 +70,7 @@ void kvz_cabac_start(cabac_data_t * const data) data->num_buffered_bytes = 0; data->buffered_byte = 0xff; data->only_count = 0; // By default, write bits out + data->update = 0; } /** @@ -349,26 +350,28 @@ void kvz_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem /** * \brief */ -void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol) +void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, + cabac_ctx_t * const ctx, + uint32_t symbol, + const int32_t offset, + const uint32_t max_symbol, + double* bits_out) { int8_t code_last = max_symbol > symbol; assert(symbol <= max_symbol); if (!max_symbol) return; - - data->cur_ctx = ctx; - CABAC_BIN(data, symbol, "ums"); + + CABAC_FBITS_UPDATE(data, ctx, symbol, *bits_out, "ums"); if (!symbol) return; while (--symbol) { - //data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 1, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums"); } if (code_last) { - //data->cur_ctx = &ctx[offset]; - CABAC_BIN(data, 0, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 0,*bits_out, "ums"); } } @@ -405,7 +408,7 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int /** * \brief */ -void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, +uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count) @@ -426,4 +429,5 @@ void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state, num_bins += count; CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb"); + return num_bins; } diff 
--git a/src/cabac.h b/src/cabac.h index 8489333c..92c2d6b8 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -59,7 +59,8 @@ typedef struct uint32_t buffered_byte; int32_t num_buffered_bytes; int32_t bits_left; - int8_t only_count; + int8_t only_count : 4; + int8_t update : 4; bitstream_t *stream; // CONTEXTS @@ -140,11 +141,11 @@ void kvz_cabac_write(cabac_data_t *data); void kvz_cabac_finish(cabac_data_t *data); void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param, const unsigned int cutoff); -void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, +uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, - uint32_t symbol, int32_t offset, - uint32_t max_symbol); + uint32_t symbol, int32_t offset, + uint32_t max_symbol, double* bits_out); void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); #define CTX_PROB_BITS 15 @@ -153,6 +154,18 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0)) #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1)) +// Floating point fractional bits, derived from kvz_entropy_bits +extern const float kvz_f_entropy_bits[512]; +#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] + +#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ + if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ + if((cabac)->update) {\ + (cabac)->cur_ctx = ctx;\ + CABAC_BIN((cabac), (val), (name));\ + } \ +} while(0) + // Macros #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] ) #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 ) diff --git a/src/cfg.c b/src/cfg.c index 
2be8c8c6..8b74e8d5 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -149,9 +149,9 @@ int kvz_config_init(kvz_config *cfg) cfg->gop_lp_definition.t = 1; cfg->open_gop = true; - cfg->roi.width = 0; - cfg->roi.height = 0; - cfg->roi.dqps = NULL; + cfg->roi.file_path = NULL; + cfg->roi.format = KVZ_ROI_TXT; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -214,6 +214,9 @@ int kvz_config_init(kvz_config *cfg) cfg->cclm = 0; + + cfg->combine_intra_cus = 1; + cfg->force_inter = 0; return 1; } @@ -221,11 +224,11 @@ int kvz_config_destroy(kvz_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); + FREE_POINTER(cfg->roi.file_path); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); FREE_POINTER(cfg->slice_addresses_in_ts); - FREE_POINTER(cfg->roi.dqps); FREE_POINTER(cfg->fastrd_learning_outdir_fn); } free(cfg); @@ -1295,60 +1298,29 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) } else if OPT("implicit-rdpcm") cfg->implicit_rdpcm = (bool)atobool(value); + else if OPT("roi") { - // The ROI description is as follows: - // First number is width, second number is height, - // then follows width * height number of dqp values. 
- FILE* f = fopen(value, "rb"); - if (!f) { - fprintf(stderr, "Could not open ROI file.\n"); + static enum kvz_roi_format const formats[] = { KVZ_ROI_TXT, KVZ_ROI_BIN }; + static const char * const format_names[] = { "txt", "bin", NULL }; + + char *roi_file = strdup(value); + if (!roi_file) { + fprintf(stderr, "Failed to allocate memory for ROI file name.\n"); return 0; } + FREE_POINTER(cfg->roi.file_path); + cfg->roi.file_path = roi_file; - int width = 0; - int height = 0; - if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) { - fprintf(stderr, "Failed to read ROI size.\n"); - fclose(f); - return 0; + // Get file extension or the substring after the last dot + char *maybe_extension = strrchr(cfg->roi.file_path, '.'); + if (!maybe_extension) { + cfg->roi.format = KVZ_ROI_TXT; + } else { + maybe_extension++; + int8_t format; + bool unknown_format = !parse_enum(maybe_extension, format_names, &format); + cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formats[format]; } - - if (width <= 0 || height <= 0) { - fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height); - fclose(f); - return 0; - } - - if (width > 10000 || height > 10000) { - fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); - fclose(f); - return 0; - } - - const unsigned size = width * height; - int8_t *dqp_array = calloc((size_t)size, sizeof(cfg->roi.dqps[0])); - if (!dqp_array) { - fprintf(stderr, "Failed to allocate memory for ROI table.\n"); - fclose(f); - return 0; - } - - FREE_POINTER(cfg->roi.dqps); - cfg->roi.dqps = dqp_array; - cfg->roi.width = width; - cfg->roi.height = height; - - for (int i = 0; i < size; ++i) { - int number; // Need a pointer to int for fscanf - if (fscanf(f, "%d", &number) != 1) { - fprintf(stderr, "Reading ROI file failed.\n"); - fclose(f); - return 0; - } - dqp_array[i] = CLIP(-51, 51, number); - } - - fclose(f); } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); @@ -1502,6 +1474,12 @@ int kvz_config_parse(kvz_config 
*cfg, const char *name, const char *value) else if OPT("cclm") { cfg->cclm = (bool)atobool(value); } + else if OPT("combine-intra-cus") { + cfg->combine_intra_cus = atobool(value); + } + else if OPT("force-inter") { + cfg->force_inter = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index baa5a07a..a68a5ce0 100644 --- a/src/cli.c +++ b/src/cli.c @@ -145,6 +145,7 @@ static const struct option long_options[] = { { "force-level", required_argument, NULL, 0 }, { "high-tier", no_argument, NULL, 0 }, { "me-steps", required_argument, NULL, 0 }, + { "roi-file", required_argument, NULL, 0 }, { "fast-residual-cost", required_argument, NULL, 0 }, { "set-qp-in-cu", no_argument, NULL, 0 }, { "open-gop", no_argument, NULL, 0 }, @@ -183,6 +184,10 @@ static const struct option long_options[] = { { "no-amvr", no_argument, NULL, 0 }, { "cclm", no_argument, NULL, 0 }, { "no-cclm", no_argument, NULL, 0 }, + { "combine-intra-cus", no_argument, NULL, 0 }, + { "no-combine-intra-cus", no_argument, NULL, 0 }, + { "force-inter", no_argument, NULL, 0 }, + { "no-force-inter", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -504,11 +509,20 @@ void print_help(void) " - frametile: Constrain within the tile.\n" " - frametilemargin: Constrain even more.\n" " --roi : Use a delta QP map for region of interest.\n" - " Reads an array of delta QP values from a text\n" - " file. The file format is: width and height of\n" - " the QP delta map followed by width*height delta\n" - " QP values in raster order. The map can be of any\n" - " size and will be scaled to the video size.\n" + " Reads an array of delta QP values from a file.\n" + " Text and binary files are supported and detected\n" + " from the file extension (.txt/.bin). If a known\n" + " extension is not found, the file is treated as\n" + " a text file. 
The file can include one or many\n" + " ROI frames each in the following format:\n" + " width and height of the QP delta map followed\n" + " by width * height delta QP values in raster\n" + " order. In binary format, width and height are\n" + " 32-bit integers whereas the delta QP values are\n" + " signed 8-bit values. The map can be of any size\n" + " and will be scaled to the video size. The file\n" + " reading will loop if end of the file is reached.\n" + " See roi.txt in the examples folder.\n" " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" " in PPS and slice_qp_delta in slize header zero.\n" " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" @@ -594,6 +608,16 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n" + " on lower depth even when search is not\n" + " performed on said depth. Should only\n" + " be disabled if cus absolutely must not\n" + " be larger than limited by the search.\n" + " [enabled]\n" + " --force-inter : Force the encoder to use inter always.\n" + " This is mostly for debugging and is not\n" + " guaranteed to produce sensible bitstream or\n" + " work at all. 
[disabled]\n" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/encmain.c b/src/encmain.c index 0cdea6f7..1d9175fc 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -441,6 +441,7 @@ int main(int argc, char *argv[]) FILE *input = NULL; //!< input file (YUV) FILE *output = NULL; //!< output file (HEVC NAL stream) FILE *recout = NULL; //!< reconstructed YUV output, --debug + FILE *roifile = NULL; clock_t start_time = clock(); clock_t encoding_start_cpu_time; KVZ_CLOCK_T encoding_start_real_time; @@ -584,7 +585,7 @@ int main(int argc, char *argv[]) // Give arguments via struct to the input thread input_handler_args in_args = { .available_input_slots = available_input_slots, - .filled_input_slots = filled_input_slots, + .filled_input_slots = filled_input_slots, .input = input, .api = api, @@ -825,6 +826,7 @@ done: if (input) fclose(input); if (output) fclose(output); if (recout) fclose(recout); + if (roifile) fclose(roifile); DBG_YUVIEW_CLEANUP(); CHECKPOINTS_FINALIZE(); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 21c368e0..a6adb249 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -581,7 +581,7 @@ static void encode_transform_coeff(encoder_state_t * const state, // cu_qp_delta_abs prefix cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0]; - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5); + kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL); if (qp_delta_abs >= 5) { // cu_qp_delta_abs suffix @@ -610,17 +610,19 @@ static void encode_transform_coeff(encoder_state_t * const state, * \param depth Depth from LCU. 
* \return if non-zero mvd is coded */ -static bool encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth) +int kvz_encode_inter_prediction_unit(encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu, double* bits_out) { // Mergeflag int16_t num_cand = 0; bool non_zero_mvd = false; - cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); + double bits = 0; + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag"); + num_cand = state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { @@ -628,10 +630,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + if(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -650,12 +652,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 uint32_t inter_dir_ctx = (7 - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height) + 1) >> 1)); - cabac->cur_ctx = &(cabac->ctx.inter_dir[inter_dir_ctx]); - CABAC_BIN(cabac, (inter_dir == 3), "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } if (inter_dir < 3) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[5]); - CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); + 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[5]), (inter_dir == 2), bits, "inter_pred_idc"); } } @@ -674,20 +674,21 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (ref_LX_size > 1) { // parseRefFrmIdx int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame > 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0 && ref_LX_size > 2) { cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); - CABAC_BIN(cabac, (ref_frame > 1), "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), (ref_frame > 1), bits, "ref_idx_lX"); if (ref_frame > 1 && ref_LX_size > 3) { for (int idx = 3; idx < ref_LX_size; idx++) { uint8_t val = (ref_frame > idx - 1) ? 1 : 0; CABAC_BIN_EP(cabac, val, "ref_idx_lX"); + if (cabac->only_count) bits += 1; if (!val) break; + } } } @@ -697,28 +698,37 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { mv_t mv_cand[2][2]; - kvz_inter_get_mv_cand_cua( + if (lcu) { + kvz_inter_get_mv_cand( + state, + x, y, width, height, + mv_cand, cur_cu, + lcu, ref_list_idx); + } + else { + kvz_inter_get_mv_cand_cua( state, x, y, width, height, - mv_cand, cur_cu, ref_list_idx); + mv_cand, cur_cu, ref_list_idx + ); + } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; kvz_change_precision(INTERNAL_MV_PREC, kvz_g_imv_to_prec[KVZ_IMV_OFF], &mvd_hor, &mvd_ver); - - kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver); + kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out); non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0); } // Signal which candidate MV to use - cabac->cur_ctx = 
&(cabac->ctx.mvp_idx_model); - CABAC_BIN(cabac, CU_GET_MV_CAND(cur_cu, ref_list_idx), "mvp_flag"); + CABAC_FBITS_UPDATE(cabac,&(cabac->ctx.mvp_idx_model), CU_GET_MV_CAND(cur_cu, ref_list_idx), bits, "mvp_flag"); } // for ref_list } // if !merge + if(bits_out) *bits_out += bits; return non_zero_mvd; } @@ -807,7 +817,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c static void encode_intra_coding_unit(encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, lcu_coeff_t* coeff) + int x, int y, int depth, lcu_t* lcu, lcu_coeff_t* coeff, double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual[4]; @@ -1050,6 +1060,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state, kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT); } + if (cabac->only_count && bits_out) *bits_out += 5; } } @@ -1057,14 +1068,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state, if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) { encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); } + // if we are counting bits, the cost for transform coeffs is done separately + // To get the distortion at the same time + if (!cabac->only_count) { + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); + encode_mts_idx(state, cabac, cur_cu); - encode_mts_idx(state, cabac, cur_cu); - - if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); + if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) { + encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, 
cu_width, state->encoder_control->cfg.cclm); + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); + } } } @@ -1105,32 +1119,32 @@ static void encode_part_mode(encoder_state_t * const state, // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass // ------------------------------+------------------ - + double bits = 0; if (cur_cu->type == CU_INTRA) { if (depth == MAX_DEPTH) { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); } else { - CABAC_BIN(cabac, 0, "part_mode NxN"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN"); } } } else { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode split"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split"); cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_2NxnD) { - CABAC_BIN(cabac, 1, "part_mode vertical"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical"); } else { - CABAC_BIN(cabac, 0, "part_mode horizontal"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal"); } if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) { @@ -1138,19 +1152,22 @@ static void encode_part_mode(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_Nx2N) { - CABAC_BIN(cabac, 1, "part_mode SMP"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP"); + 
return bits; } - CABAC_BIN(cabac, 0, "part_mode AMP"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP"); if (cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_nLx2N) { CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } else { CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } } } + return bits; } **/ @@ -1191,7 +1208,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ - if (depth <= ctrl->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } @@ -1456,7 +1473,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); - non_zero_mvd |= encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth); + non_zero_mvd |= kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); kvz_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); } @@ -1494,7 +1511,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state, } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, coeff); + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL); } else { @@ -1511,11 +1528,128 @@ end: } +double kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu, cu_info_t* cur_cu) { + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + + const int cu_width = LCU_WIDTH >> depth; + + const cu_info_t* 
left_cu = NULL, *above_cu = NULL; + if (x) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); + } + uint8_t split_model = 0; + + // Absolute coordinates + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y; + + // Check for slice border + bool border_x = ctrl->in.width < abs_x + cu_width; + bool border_y = ctrl->in.height < abs_y + cu_width; + bool border = border_x || border_y; /*!< are we in any border CU */ + + if (depth <= state->frame->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + + // When not in MAX_DEPTH, insert split flag and split the blocks if needed + if (depth != MAX_DEPTH) { + // Implicit split flag when on border + if (!border) { + // Get left and top block split_flags and if they are present and true, increase model number + if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + split_model++; + } + + if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + split_model++; + } + + // This mocks encoding the current CU so it should be never split + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag"); + } + } + + // Encode skip flag + if (state->frame->slicetype != KVZ_SLICE_I) { + int8_t ctx_skip = 0; + + if (left_cu && left_cu->skipped) { + ctx_skip++; + } + if (above_cu && above_cu->skipped) { + ctx_skip++; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag"); + + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + if(cabac->only_count) bits += 1; + } + if (symbol == 0) { + break; + } + } + } + return bits; 
+ } + } + // Prediction mode + if (state->frame->slicetype != KVZ_SLICE_I && cu_width != 4) { + + int8_t ctx_predmode = 0; + + if ((left_cu && left_cu->type == CU_INTRA) || (above_cu && above_cu->type == CU_INTRA)) { + ctx_predmode = 1; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode"); + } + + if (cur_cu->type == CU_INTER) { + const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; + + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); + const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); + const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); + const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); + const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits); + } + } + else if (cur_cu->type == CU_INTRA) { + encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits); + } + return bits; +} + void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver) + int32_t mvd_ver, double* bits_out) { const int8_t hor_abs_gr0 = mvd_hor != 0; const int8_t ver_abs_gr0 = mvd_ver != 0; @@ -1523,29 +1657,33 @@ void kvz_encode_mvd(encoder_state_t * const state, const uint32_t mvd_ver_abs = abs(mvd_ver); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver"); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + 
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor"); } if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver"); } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + if(cabac->only_count) *bits_out += bits; } uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1; CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + if (cabac->only_count) *bits_out += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + if (cabac->only_count) *bits_out += bits; } uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1; CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + if (cabac->only_count) *bits_out += 1; } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index ea792845..24f2759d 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -56,7 +56,22 @@ void kvz_encode_ts_residual(encoder_state_t* const state, void kvz_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver); + int32_t mvd_ver, + double* bits_out); + +double kvz_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu, cu_info_t* cur_cu); + +int kvz_encode_inter_prediction_unit(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, + lcu_t* lcu, + double* bits_out); void kvz_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, diff --git a/src/encoder.c b/src/encoder.c index 
98d87690..6ecddb86 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -32,7 +32,6 @@ #include "encoder.h" -// This define is required for M_PI on Windows. #define _USE_MATH_DEFINES #include #include @@ -45,14 +44,6 @@ #include "kvz_math.h" #include "fast_coeff_cost.h" -/** - * \brief Strength of QP adjustments when using adaptive QP for 360 video. - * - * Determined empirically. - */ -static const double ERP_AQP_STRENGTH = 3.0; - - static int encoder_control_init_gop_layer_weights(encoder_control_t * const); static unsigned cfg_num_threads(void) @@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder) } -/** - * \brief Return weight for 360 degree ERP video - * - * Returns the scaling factor of area from equirectangular projection to - * spherical surface. - * - * \param y y-coordinate of the pixel - * \param h height of the picture - */ -static double ws_weight(int y, int h) -{ - return cos((y - 0.5 * h + 0.5) * (M_PI / h)); -} - - - /** * \brief Update ROI QPs for 360 video with equirectangular projection. * @@ -162,55 +137,6 @@ static double ws_weight(int y, int h) * \param orig_width width of orig_roi * \param orig_height height of orig_roi */ -static void init_erp_aqp_roi(encoder_control_t* encoder, - int8_t *orig_roi, - int32_t orig_width, - int32_t orig_height) -{ - // Update ROI with WS-PSNR delta QPs. - int height = encoder->in.height_in_lcu; - int width = orig_roi ? 
orig_width : 1; - - int frame_height = encoder->in.real_height; - - encoder->cfg.roi.width = width; - encoder->cfg.roi.height = height; - encoder->cfg.roi.dqps = calloc(width * height, sizeof(orig_roi[0])); - - double total_weight = 0.0; - for (int y = 0; y < frame_height; y++) { - total_weight += ws_weight(y, frame_height); - } - - for (int y_lcu = 0; y_lcu < height; y_lcu++) { - int y_orig = LCU_WIDTH * y_lcu; - int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); - - double lcu_weight = 0.0; - for (int y = y_orig; y < y_orig + lcu_height; y++) { - lcu_weight += ws_weight(y, frame_height); - } - // Normalize. - lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); - - int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); - - if (orig_roi) { - // If a ROI array already exists, we copy the existing values to the - // new array while adding qp_delta to each. - int y_roi = y_lcu * orig_height / height; - for (int x = 0; x < width; x++) { - encoder->cfg.roi.dqps[x + y_lcu * width] = - CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta); - } - - } else { - // Otherwise, simply write qp_delta to the ROI array. 
- encoder->cfg.roi.dqps[y_lcu] = qp_delta; - } - } -} - static int8_t* derive_chroma_QP_mapping_table(const kvz_config* const cfg, int i) { @@ -394,6 +320,16 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) encoder->scaling_list.use_default_list = 1; } + // ROI / delta QP + if (cfg->roi.file_path) { + const char *mode[2] = { "r", "rb" }; + encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]); + if (!encoder->roi_file) { + fprintf(stderr, "Could not open ROI file.\n"); + goto init_failed; + } + } + if (cfg->fast_coeff_table_fn) { FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb"); if (fast_coeff_table_f == NULL) { @@ -435,32 +371,10 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) goto init_failed; } - if (cfg->erp_aqp) { - init_erp_aqp_roi(encoder, - cfg->roi.dqps, - cfg->roi.width, - cfg->roi.height); - - } else if (cfg->roi.dqps) { - // Copy delta QP array for ROI coding. - const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height; - encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0])); - memcpy(encoder->cfg.roi.dqps, - cfg->roi.dqps, - roi_size * sizeof(*cfg->roi.dqps)); - - } - // NOTE: When tr_depth_inter is equal to 0, the transform is still split // for SMP and AMP partition units. 
encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { - encoder->max_qp_delta_depth = 0; - } else { - encoder->max_qp_delta_depth = -1; - } - //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; @@ -761,7 +675,7 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) FREE_POINTER(encoder->tiles_tile_id); - FREE_POINTER(encoder->cfg.roi.dqps); + FREE_POINTER(encoder->cfg.roi.file_path); kvz_scalinglist_destroy(&encoder->scaling_list); @@ -773,6 +687,10 @@ void kvz_encoder_control_free(encoder_control_t *const encoder) kvz_close_rdcost_outfiles(); + if (encoder->roi_file) { + fclose(encoder->roi_file); + } + free(encoder); } diff --git a/src/encoder.h b/src/encoder.h index 6d301611..c0d0fda3 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -130,7 +130,7 @@ typedef struct encoder_control_t //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - int8_t max_qp_delta_depth; + FILE *roi_file; int tr_depth_inter; diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 0f84b512..2f24894e 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -805,10 +805,10 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag"); WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26"); - WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag"); - if (encoder->max_qp_delta_depth >= 0) { + WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag"); + if (state->frame->max_qp_delta_depth >= 0) { // Use separate QP for each LCU when rate control is enabled. 
- WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth"); + WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth"); } WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag"); diff --git a/src/encoderstate.c b/src/encoderstate.c index b0691ac7..db5b93f3 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -32,6 +32,9 @@ #include "encoderstate.h" + // This define is required for M_PI on Windows. +#define _USE_MATH_DEFINES +#include #include #include #include @@ -53,6 +56,13 @@ #include "strategies/strategies-picture.h" +/** + * \brief Strength of QP adjustments when using adaptive QP for 360 video. + * + * Determined empirically. + */ +static const double ERP_AQP_STRENGTH = 3.0; + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -572,7 +582,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - if (depth <= state->encoder_control->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } @@ -665,7 +675,7 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); @@ -716,6 +726,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) const uint64_t existing_bits = kvz_bitstream_tell(&state->stream); //Encode SAO + state->cabac.update = 1; if (encoder->cfg.sao_type) { encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + 
lcu->position.x]); } @@ -771,6 +782,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) kvz_cabac_start(&state->cabac); } } + state->cabac.update = 0; pthread_mutex_lock(&state->frame->rc_lock); @@ -1421,6 +1433,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) } } + +/** + * \brief Return weight for 360 degree ERP video + * + * Returns the scaling factor of area from equirectangular projection to + * spherical surface. + * + * \param y y-coordinate of the pixel + * \param h height of the picture + */ +static double ws_weight(int y, int h) +{ + return cos((y - 0.5 * h + 0.5) * (M_PI / h)); +} + + +/** + * \brief Update ROI QPs for 360 video with equirectangular projection. + * + * Updates the ROI parameters in frame->roi. + * + * \param encoder encoder control + * \param frame frame that will have the ROI map + */ +static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame) +{ + int8_t *orig_roi = frame->roi.roi_array; + int32_t orig_width = frame->roi.width; + int32_t orig_height = frame->roi.height; + + // Update ROI with WS-PSNR delta QPs. + int new_height = encoder->in.height_in_lcu; + int new_width = orig_roi ? orig_width : 1; + int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0])); + + int frame_height = encoder->in.real_height; + + double total_weight = 0.0; + for (int y = 0; y < frame_height; y++) { + total_weight += ws_weight(y, frame_height); + } + + for (int y_lcu = 0; y_lcu < new_height; y_lcu++) { + int y_orig = LCU_WIDTH * y_lcu; + int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); + + double lcu_weight = 0.0; + for (int y = y_orig; y < y_orig + lcu_height; y++) { + lcu_weight += ws_weight(y, frame_height); + } + // Normalize. 
+ lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); + + int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); + + if (orig_roi) { + // If a ROI array already exists, we copy the existing values to the + // new array while adding qp_delta to each. + int y_roi = y_lcu * orig_height / new_height; + for (int x = 0; x < new_width; x++) { + new_array[x + y_lcu * new_width] = + CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta); + } + + } else { + // Otherwise, simply write qp_delta to the ROI array. + new_array[y_lcu] = qp_delta; + } + } + + // Update new values + frame->roi.width = new_width; + frame->roi.height = new_height; + frame->roi.roi_array = new_array; + FREE_POINTER(orig_roi); +} + + +static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) { + // The ROI description is as follows: + // First number is width, second number is height, + // then follows width * height number of dqp values. + + // Rewind the (seekable) ROI file when end of file is reached. + // Allows a single ROI frame to be used for a whole sequence + // and looping with --loop-input. Skips possible whitespace. 
+ if (ftell(file) != -1L) { + int c = fgetc(file); + while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file); + ungetc(c, file); + if (c == EOF) rewind(file); + } + + int *width = &frame->roi.width; + int *height = &frame->roi.height; + + bool failed = false; + + if (format == KVZ_ROI_TXT) failed = !fscanf(file, "%d", width) || !fscanf(file, "%d", height); + if (format == KVZ_ROI_BIN) failed = fread(&frame->roi, 4, 2, file) != 2; + + if (failed) { + fprintf(stderr, "Failed to read ROI size.\n"); + fclose(file); + assert(0); + } + + if (*width <= 0 || *height <= 0) { + fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height); + fclose(file); + assert(0); + } + + if (*width > 10000 || *height > 10000) { + fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(file); + assert(0); + } + + const unsigned size = (*width) * (*height); + int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0])); + if (!dqp_array) { + fprintf(stderr, "Failed to allocate memory for ROI table.\n"); + fclose(file); + assert(0); + } + + FREE_POINTER(frame->roi.roi_array); + frame->roi.roi_array = dqp_array; + + if (format == KVZ_ROI_TXT) { + for (int i = 0; i < size; ++i) { + int number; // Need a pointer to int for fscanf + if (fscanf(file, "%d", &number) != 1) { + fprintf(stderr, "Reading ROI file failed.\n"); + fclose(file); + assert(0); + } + dqp_array[i] = CLIP(-51, 51, number); + } + } else if (format == KVZ_ROI_BIN) { + if (fread(dqp_array, 1, size, file) != size) { + fprintf(stderr, "Reading ROI file failed.\n"); + assert(0); + } + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1437,6 +1597,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); } + // ROI / delta QP maps + if (frame->roi.roi_array && 
cfg->roi.file_path) { + assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified."); + } + + // Read frame from the file. If no file is specified, + // ROI data should be already set by the application. + if (cfg->roi.file_path) { + next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format); + } + + if (cfg->erp_aqp) { + init_erp_aqp_roi(state->encoder_control, state->tile->frame->source); + } + // Variance adaptive quantization if (cfg->vaq) { const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -1523,6 +1698,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict } // Variance adaptive quantization - END + if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) { + state->frame->max_qp_delta_depth = 0; + } else { + state->frame->max_qp_delta_depth = -1; + } + // Use this flag to handle closed gop irap picture selection. // If set to true, irap is already set and we avoid // setting it based on the intra period @@ -1834,10 +2015,9 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y) int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { - const encoder_control_t *ctrl = state->encoder_control; const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); + const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); diff --git a/src/encoderstate.h b/src/encoderstate.h index 8100cf31..19c0d196 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t { */ double *aq_offsets; + int8_t max_qp_delta_depth; + /** * \brief Whether next NAL is 
the first NAL in the access unit. */ @@ -320,6 +322,7 @@ typedef struct encoder_state_t { bitstream_t stream; cabac_data_t cabac; + cabac_data_t search_cabac; uint32_t stats_bitstream_length; //Bitstream length written in bytes @@ -402,10 +405,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) */ static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) { - if (state->encoder_control->max_qp_delta_depth < 0) return false; + if (state->frame->max_qp_delta_depth < 0) return false; const int cu_width = LCU_WIDTH >> depth; - const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; const int right = x + cu_width; const int bottom = y + cu_width; return (right % qg_width == 0 || right >= state->tile->frame->width) && diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c index 4fc392bf..cf6173db 100644 --- a/src/fast_coeff_cost.c +++ b/src/fast_coeff_cost.c @@ -40,7 +40,7 @@ static uint16_t to_q88(float f) return (uint16_t)(f * 256.0f + 0.5f); } -static uint64_t to_4xq88(const float f[4]) +static uint64_t to_4xq88(const double f[4]) { int i; uint64_t result = 0; @@ -58,9 +58,9 @@ int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_ uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp; for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) { - float curr_wts[4]; + double curr_wts[4]; - if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0, + if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0, curr_wts + 1, curr_wts + 2, curr_wts + 3) != 4) { diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h index dcd67c8d..8dcfbd08 100644 --- a/src/fast_coeff_cost.h +++ b/src/fast_coeff_cost.h @@ -45,7 +45,7 @@ typedef struct { // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from // 0 to MAX_FAST_COEFF_COST_QP -static const float 
default_fast_coeff_cost_wts[][4] = { +static const double default_fast_coeff_cost_wts[][4] = { // Just extend it by stretching the first actual values.. {0.164240f, 4.161530f, 3.509033f, 6.928047f}, {0.164240f, 4.161530f, 3.509033f, 6.928047f}, diff --git a/src/filter.c b/src/filter.c index aad84dbc..5b2d5641 100644 --- a/src/filter.c +++ b/src/filter.c @@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir) static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (state->encoder_control->max_qp_delta_depth < 0) { + if (state->frame->max_qp_delta_depth < 0) { return state->qp; } diff --git a/src/image.c b/src/image.c index 39d17ea3..f3aee439 100644 --- a/src/image.c +++ b/src/image.c @@ -106,6 +106,10 @@ kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_ im->interlacing = KVZ_INTERLACING_NONE; + im->roi.roi_array = NULL; + im->roi.width = 0; + im->roi.height = 0; + return im; } @@ -132,6 +136,7 @@ void kvz_image_free(kvz_picture *const im) kvz_image_free(im->base_image); } else { free(im->fulldata_buf); + if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array); } // Make sure freed data won't be used. 
@@ -192,6 +197,8 @@ kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image, im->pts = 0; im->dts = 0; + im->roi = orig_image->roi; + return im; } diff --git a/src/inter.c b/src/inter.c index 9fad9619..44ac599f 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1290,7 +1290,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, int32_t width, int32_t height, const merge_candidates_t *merge_cand, - const cu_info_t *cur_cu, + const cu_info_t * const cur_cu, int8_t reflist, mv_t mv_cand[2][2]) { @@ -1396,7 +1396,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, mv_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t * const cur_cu, lcu_t *lcu, int8_t reflist) { diff --git a/src/inter.h b/src/inter.h index 981017dc..017ee3a5 100644 --- a/src/inter.h +++ b/src/inter.h @@ -96,7 +96,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, mv_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist); diff --git a/src/kvazaar.h b/src/kvazaar.h index 00052f83..32e77ec2 100644 --- a/src/kvazaar.h +++ b/src/kvazaar.h @@ -267,6 +267,12 @@ enum kvz_amvr_resolution KVZ_IMV_HPEL = 3 }; +enum kvz_roi_format +{ + KVZ_ROI_TXT = 0, + KVZ_ROI_BIN = 1 +}; + // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)format) @@ -410,10 +416,9 @@ typedef struct kvz_config int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */ struct { - int32_t width; - int32_t height; - int8_t *dqps; - } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */ + char *file_path; + enum kvz_roi_format format; + } roi; /*!< \brief Specify delta QPs for region of interest coding. */ unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. 
*/ @@ -526,6 +531,12 @@ typedef struct kvz_config int8_t cclm; int8_t amvr; /* \brief Adaptive motion vector resolution parameter */ + + /** \brief whether to try combining intra cus at the lower depth when search + * is not performed at said depth*/ + uint8_t combine_intra_cus; + + uint8_t force_inter; } kvz_config; /** @@ -557,6 +568,14 @@ typedef struct kvz_picture { enum kvz_chroma_format chroma_format; int32_t ref_pocs[16]; + + struct + { + int width; + int height; + int8_t *roi_array; + } roi; + } kvz_picture; /** @@ -782,6 +801,9 @@ typedef struct kvz_api { * the bitstream, length of the bitstream, the reconstructed frame, the * original frame and frame info in data_out, len_out, pic_out, src_out and * info_out, respectively. Otherwise, set the output parameters to NULL. + * + * Region of interest (ROI) / delta QP map can be specified in the input + * picture's ROI field but only when a ROI file is not used. * * After passing all of the input frames, the caller should keep calling this * function with pic_in set to NULL, until no more data is returned in the diff --git a/src/rate_control.c b/src/rate_control.c index de4046b0..8196d7de 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1088,17 +1088,20 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL) { - vector2d_t lcu = { + if (state->tile->frame->source->roi.roi_array) { + vector2d_t lcu_vec = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y }; vector2d_t roi = { - lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu, - lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu + lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu, + lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu }; - int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; - int dqp = 
ctrl->cfg.roi.dqps[roi_index]; + int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width; + int dqp = state->tile->frame->source->roi.roi_array[roi_index]; + if(dqp != 0) { + pos.x = 0; + } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); diff --git a/src/rdo.c b/src/rdo.c index be85b817..2ead71df 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost( // Take a copy of the CABAC so that we don't overwrite the contexts when // counting the bits. cabac_data_t cabac_copy; - memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy)); + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); // Clear bytes and bits and set mode to "count" cabac_copy.only_count = 1; - cabac_copy.num_buffered_bytes = 0; - cabac_copy.bits_left = 23; + int num_buffered_bytes = cabac_copy.num_buffered_bytes; + int bits_left = cabac_copy.bits_left; // Execute the coding function. 
// It is safe to drop the const modifier since state won't be modified @@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost( type, scan_mode); } - - return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); + if(cabac_copy.update) { + memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + } + return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc) @@ -1741,37 +1743,33 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, /** * Calculate cost of actual motion vectors using CABAC coding */ -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { cabac_data_t cabac_copy = *cabac; cabac_copy.only_count = 1; - + double bits = 0; // It is safe to drop const here because cabac->only_count is set. 
- kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits); - uint32_t bitcost = - ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - - ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); - - return bitcost; + return bits; } /** MVD cost calculation with CABAC * \returns int * Calculates Motion Vector cost and related costs using CABAC coding */ -uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, - int x, - int y, - int mv_shift, - mv_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +double kvz_calc_mvd_cost_cabac(const encoder_state_t * state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; @@ -1798,14 +1796,13 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, } // Store cabac state and contexts - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); + memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t)); // Clear bytes and bits and set mode to "count" state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; cabac = &state_cabac_copy; + double bits = 0; if (!merged) { vector2d_t mvd1 = { @@ -1820,8 +1817,8 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1); kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2); - uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); - uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); + double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + double cand2_cost = 
kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -1834,7 +1831,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, merged, "MergeFlag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag"); num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { @@ -1842,10 +1839,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + bits += 1; } if (symbol == 0) break; } @@ -1868,24 +1865,23 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, if (ref_list[ref_list_idx] > 1) { // parseRefFrmIdx int32_t ref_frame = ref_idx; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { int32_t i; int32_t ref_num = ref_list[ref_list_idx] - 2; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); + ref_frame--; for (i = 0; i < ref_num; ++i) { const uint32_t symbol = (i == ref_frame) ? 
0 : 1; if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + bits += 1; } if (symbol == 0) break; } @@ -1895,7 +1891,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits); } // Signal which candidate MV to use @@ -1905,10 +1901,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state, } } - *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); + *bitcost = bits; // Store bitcost before restoring cabac - return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5); + return *bitcost * state->lambda_sqrt; } void kvz_close_rdcost_outfiles(void) diff --git a/src/rdo.h b/src/rdo.h index da6cb7d4..02b218f2 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -77,10 +77,10 @@ uint32_t kvz_get_coded_level(encoder_state_t * state, double* coded_cost, double kvz_mvd_cost_func kvz_calc_mvd_cost_cabac; -uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - int32_t mvd_hor, - int32_t mvd_ver); +double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + int32_t mvd_hor, + int32_t mvd_ver); // Number of fixed point fractional bits used in the fractional bit table. 
#define CTX_FRAC_BITS 15 @@ -90,8 +90,5 @@ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, extern const uint32_t kvz_entropy_bits[512]; #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] -// Floating point fractional bits, derived from kvz_entropy_bits -extern const float kvz_f_entropy_bits[512]; -#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #endif diff --git a/src/sao.c b/src/sao.c index 461bdf90..1bf1ec29 100644 --- a/src/sao.c +++ b/src/sao.c @@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) { } -static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) +static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, none = 0 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type"); return mode_bits; } -static float sao_mode_bits_merge(const encoder_state_t * const state, +static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag"); if (merge_cand == 1) return mode_bits; - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag"); return mode_bits; } -static float sao_mode_bits_edge(const encoder_state_t * const state, +static double sao_mode_bits_edge(const encoder_state_t * const state, int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { - ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + ctx = &(cabac->ctx.sao_merge_flag_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, edge = 2 = cMax ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets. for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) { @@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, } -static float sao_mode_bits_band(const encoder_state_t * const state, +static double sao_mode_bits_band(const encoder_state_t * const state, int band_position[2], int offsets[10], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded sao_type_idx_, band = 1 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets and possible FL coded offset signs. 
for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) @@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_ // Choose between SAO and doing nothing, taking into account the // rate-distortion cost of coding do nothing. { - int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5); + float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); + int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; diff --git a/src/search.c b/src/search.c index 1bdc67d5..3bd39e6b 100644 --- a/src/search.c +++ b/src/search.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "imagelist.h" #include "inter.h" #include "intra.h" @@ -59,14 +60,6 @@ // Cost threshold for doing intra search in inter frames with --rd=0. static const int INTRA_THRESHOLD = 8; -// Modify weight of luma SSD. -#ifndef LUMA_MULT -# define LUMA_MULT 0.8 -#endif -// Modify weight of chroma SSD. 
-#ifndef CHROMA_MULT -# define CHROMA_MULT 1.5 -#endif static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) { @@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; - ssd += LUMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], LCU_WIDTH, LCU_WIDTH, cu_width ); if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); - ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); @@ -294,11 +287,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, * prediction unit data needs to be coded. */ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // cur_cu is used for TU parameters. 
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -324,14 +319,36 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, return sum + tr_tree_bits * state->lambda; } + + if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac state changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. + // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } + } + // Add transform_tree cbf_luma bit cost. 
+ const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; if (pred_cu->type == CU_INTRA || - tr_depth > 0 || + is_tr_split || cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); } // SSD between reconstruction and original @@ -343,7 +360,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, width); } - { + + if (!skip_residual_coding) { int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; @@ -351,18 +369,19 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LUMA_MULT + bits * state->lambda; + return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda; } double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t * pred_cu, - lcu_t *const lcu) + const int x_px, const int y_px, const int depth, + cu_info_t *const pred_cu, + lcu_t *const lcu) { const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; const int width = (depth < MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); double tr_tree_bits = 0; double joint_cbcr_tr_tree_bits = 0; @@ -378,22 +397,27 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return 0; } - if (depth < MAX_PU_DEPTH) { + // See luma for why the second condition + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } if(state->encoder_control->cfg.jccr) { joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1); } int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]); + ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); } if(state->encoder_control->cfg.jccr) { - ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); + ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1); } } @@ -401,7 +425,7 @@ double kvz_cu_rd_cost_chroma(const 
encoder_state_t *const state, if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); - int sum = 0; + double sum = 0; sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); @@ -448,6 +472,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, } } + if (!skip_residual_coding) { int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); @@ -464,8 +489,8 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, double bits = tr_tree_bits + coeff_bits; double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits; - double cost = (double)ssd + bits * state->c_lambda; - double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda; + double cost = (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda; + double joint_cost = (double)joint_ssd * KVZ_CHROMA_MULT + joint_bits * state->c_lambda; if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) { pred_cu->joint_cb_cr = 0; return cost; @@ -485,6 +510,117 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, return joint_cost; } +static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu) { + const int width = LCU_WIDTH >> depth; + + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + // cur_cu is used for TU parameters. 
+ cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + + double coeff_bits = 0; + double tr_tree_bits = 0; + + // Check that lcu is not in + assert(x_px >= 0 && x_px < LCU_WIDTH); + assert(y_px >= 0 && y_px < LCU_WIDTH); + + const uint8_t tr_depth = tr_cu->tr_depth - depth; + + const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + + { + int cbf = cbf_is_set_any(pred_cu->cbf, depth); + // Only need to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + } + + } + + if(state->encoder_control->chroma_format != KVZ_CSP_400 && !skip_residual_coding) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); + } + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); + } + } + + if (tr_depth > 0) { + int offset = LCU_WIDTH >> (depth + 1); + double sum = 0; + + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + return sum + tr_tree_bits * state->lambda; + } + const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; + + // Add transform_tree cbf_luma bit cost. 
+ const int is_tr_split = depth - tr_cu->depth; + if ((pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) + && !skip_residual_coding) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); + + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + } + // SSD between reconstruction and original + unsigned luma_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + int index = y_px * LCU_WIDTH + x_px; + luma_ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width); + } + + { + int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + + coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip); + } + + unsigned chroma_ssd = 0; + if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) { + const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; + const int chroma_width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + chroma_ssd = ssd_u + ssd_v; + } + + { + int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0); + coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0); + } + } + + double bits = tr_tree_bits + coeff_bits; + return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda; +} + // Return estimate of bits used to code prediction mode of cur_cu. static double calc_mode_bits(const encoder_state_t *state, @@ -518,6 +654,7 @@ static double calc_mode_bits(const encoder_state_t *state, } +// TODO: replace usages of this by the kvz_sort_indices_by_cost function. /** * \brief Sort modes and costs to ascending order according to costs. */ @@ -567,6 +704,23 @@ void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf } } +/** + * \brief Sort keys (indices) to ascending order according to costs. + */ +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map) +{ + // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
+ for (uint8_t i = 1; i < map->size; ++i) { + const int8_t cur_indx = map->keys[i]; + const double cur_cost = map->cost[cur_indx]; + uint8_t j = i; + while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) { + map->keys[j] = map->keys[j - 1]; + --j; + } + map->keys[j] = cur_indx; + } +} static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) @@ -592,10 +746,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; - double cost = MAX_INT; - double inter_zero_coeff_cost = MAX_INT; - uint32_t inter_bitcost = MAX_INT; + double cost = MAX_DOUBLE; + double inter_zero_coeff_cost = MAX_DOUBLE; + double inter_bitcost = MAX_INT; cu_info_t *cur_cu; + cabac_data_t pre_search_cabac; + memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; @@ -626,7 +782,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // Assign correct depth limit constraint_t* constr = state->constraint; - if(constr->ml_intra_depth_ctu) { + if(constr->ml_intra_depth_ctu) { pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; } @@ -670,7 +826,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (can_use_inter) { double mode_cost; - uint32_t mode_bitcost; + double mode_bitcost; kvz_search_cu_inter(state, x, y, depth, @@ -721,12 +877,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + 
(WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width || - (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height; + (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && + !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I); if (can_use_intra && !skip_intra) { int8_t intra_mode; @@ -737,6 +894,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, bool mip_transposed = false; kvz_search_cu_intra(state, x, y, depth, lcu, &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default + if(state->frame->slicetype != KVZ_SLICE_I) { + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); + intra_cost += pred_mode_type_bits * state->lambda; + } +#endif if (intra_cost < cost) { cost = intra_cost; cur_cu->type = CU_INTRA; @@ -828,9 +995,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; - } + int skip_ctx = kvz_get_skip_context(x, y, lcu, NULL, NULL); + inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1); + inter_bitcost += 
CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0); + inter_bitcost += cur_cu->merge_idx; } } lcu_fill_inter(lcu, x_local, y_local, cu_width); @@ -839,20 +1007,26 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { - cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + double bits = 0; + cabac_data_t* cabac = &state->search_cabac; + cabac->update = 1; + + if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { + bits += kvz_mock_encode_coding_unit( + state, + cabac, + x, y, depth, + lcu, + cur_cu); } - - double mode_bits; - if (cur_cu->type == CU_INTRA) { - mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth); - } else { - mode_bits = inter_bitcost; + else { + assert(0); } + + cost = bits * state->lambda; - cost += mode_bits * state->lambda; - + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -874,13 +1048,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - } + cabac->update = 0; + } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. 
cur_cu->type == CU_NOTSET || - depth < pu_depth_intra.max || + (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != KVZ_SLICE_I)) || (state->frame->slicetype != KVZ_SLICE_I && depth < pu_depth_inter.max); @@ -889,21 +1064,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int half_cu = cu_width / 2; double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf, depth); + cabac_data_t post_seach_cabac; + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + state->search_cabac.update = 1; + + double split_bits = 0; if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search"); } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); - cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N - split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN + cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search"); } + state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting @@ -925,13 +1106,29 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // searching. 
if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH - && x + cu_width <= frame->width && y + cu_width <= frame->height && 0) + && x + cu_width <= frame->width && y + cu_width <= frame->height + && state->encoder_control->cfg.combine_intra_cus) { + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; + double bits = 0; + if (depth < MAX_DEPTH) { + uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); + cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search"); + } + else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) { + // Add cost of intra part_size. + cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]); + CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search"); + } cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -952,19 +1149,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, lcu); - cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); - if (has_chroma) { - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); - } - - // Add the cost of coding no-split. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - - // Add the cost of coding intra mode only once. 
- double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; cost += mode_bits * state->lambda; + + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } @@ -978,6 +1169,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth > 0) { // Copy this CU's mode all the way down for use in adjacent CUs mode // search. + memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); work_tree_copy_down(x_local, y_local, depth, work_tree); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] @@ -1167,6 +1359,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff) { + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index 4eb5943f..db87c298 100644 --- a/src/search.h +++ b/src/search.h @@ -44,22 +44,53 @@ #include "image.h" #include "constraint.h" +#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) + + // Modify weight of luma SSD. +#ifndef KVZ_LUMA_MULT +# define KVZ_LUMA_MULT 0.8 +#endif +// Modify weight of chroma SSD. +#ifndef KVZ_CHROMA_MULT +# define KVZ_CHROMA_MULT 1.5 +#endif + + /** + * \brief Data collected during search processes. + * + * The intended use is to collect statistics of the + * searched coding/prediction units. Data related to + * a specific unit is found at index i. 
The arrays + * should be indexed by elements of the "keys" array + * that will be sorted by the RD costs of the units. + */ +typedef struct unit_stats_map_t { + + cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units + double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs + double bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays + int size; //!< number of active elements in the lists +} unit_stats_map_t; + #define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12) #define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length); +void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map); + void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff); double kvz_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); double kvz_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t * pred_cu, - lcu_t *const lcu); + const int x_px, const int y_px, const int depth, + cu_info_t *const pred_cu, + lcu_t *const lcu); void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth); void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c 
b/src/search_inter.c index 7c8bc0bb..73e15f95 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "image.h" #include "imagelist.h" #include "inter.h" @@ -68,7 +69,7 @@ typedef struct { /** * \brief Top-left corner of the PU */ - const vector2d_t origin; + vector2d_t origin; int32_t width; int32_t height; @@ -78,19 +79,6 @@ typedef struct { kvz_mvd_cost_func *mvd_cost_func; - /** - * \brief Best motion vector among the ones tested so far - */ - vector2d_t best_mv; - /** - * \brief Cost of best_mv - */ - uint32_t best_cost; - /** - * \brief Bit cost of best_mv - */ - uint32_t best_bitcost; - /** * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks @@ -205,20 +193,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int /** * \brief Calculate cost for an integer motion vector. * - * Updates info->best_mv, info->best_cost and info->best_bitcost to the new + * Updates best_mv, best_cost and best_bitcost to the new * motion vector if it yields a lower cost than the current one. * * If the motion vector violates the MV constraints for tiles or WPP, the * cost is not set. 
* - * \return true if info->best_mv was changed, false otherwise + * \return true if best_mv was changed, false otherwise */ -static bool check_mv_cost(inter_search_info_t *info, int x, int y) +static bool check_mv_cost(inter_search_info_t *info, + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; - uint32_t bitcost = 0; - uint32_t cost = kvz_image_calc_sad( + double bitcost = 0; + double cost = kvz_image_calc_sad( info->pic, info->ref, info->origin.x, @@ -230,25 +223,25 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->optimized_sad ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; cost += info->mvd_cost_func( info->state, x, y, INTERNAL_MV_PREC, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcost ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; // Set to motion vector in internal pixel precision. - info->best_mv.x = x * (1 << INTERNAL_MV_PREC); - info->best_mv.y = y * (1 << INTERNAL_MV_PREC); - info->best_cost = cost; - info->best_bitcost = bitcost; + best_mv->x = x * (1 << INTERNAL_MV_PREC); + best_mv->y = y * (1 << INTERNAL_MV_PREC); + *best_cost = cost; + *best_bits = bitcost; return true; } @@ -256,10 +249,10 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) { - // Calculate 2 * log2(symbol + 2) + // Calculate 2 * log2(symbol ) unsigned bins = 0; - symbol += 2; + symbol += 0; if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } @@ -299,12 +292,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * \brief Select starting point for integer motion estimation search. 
* * Checks the zero vector, extra_mv and merge candidates and updates - * info->best_mv to the best one. + * best_mv to the best one. */ -static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv) +static void select_starting_point(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. - check_mv_cost(info, 0, 0); + check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); // Change to integer precision. extra_mv.x >>= INTERNAL_MV_PREC; @@ -312,7 +309,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv // Check mv_in if it's not one of the merge candidates. if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { - check_mv_cost(info, extra_mv.x, extra_mv.y); + check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } // Go through candidates @@ -324,17 +321,17 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv if (x == 0 && y == 0) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } -static uint32_t get_mvd_coding_cost(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +static double get_mvd_coding_cost(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - unsigned bitcost = 0; + double bitcost = 0; const int8_t hor_abs_gr0 = mvd_hor != 0; const int8_t ver_abs_gr0 = mvd_ver != 0; @@ -366,7 +363,7 @@ static uint32_t get_mvd_coding_cost(const encoder_state_t *state, // Round and shift back to integer bits. 
- return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + return bitcost / (1 << CTX_FRAC_BITS); } @@ -374,7 +371,7 @@ static int select_mv_cand(const encoder_state_t *state, mv_t mv_cand[2][2], int32_t mv_x, int32_t mv_y, - uint32_t *cost_out) + double*cost_out) { const bool same_cand = (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); @@ -384,7 +381,7 @@ static int select_mv_cand(const encoder_state_t *state, return 0; } - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + double (*mvd_coding_cost)(const encoder_state_t * const state, const cabac_data_t*, int32_t, int32_t); if (state->encoder_control->cfg.mv_rdo) { @@ -397,12 +394,12 @@ static int select_mv_cand(const encoder_state_t *state, kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd); - uint32_t cand1_cost = mvd_coding_cost( + double cand1_cost = mvd_coding_cost( state, &state->cabac, mvd.x, mvd.y); - uint32_t cand2_cost; + double cand2_cost; if (same_cand) { cand2_cost = cand1_cost; } else { @@ -423,17 +420,17 @@ static int select_mv_cand(const encoder_state_t *state, } -static uint32_t calc_mvd_cost(const encoder_state_t *state, - int x, - int y, - int mv_shift, - mv_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +static double calc_mvd_cost(const encoder_state_t *state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) { - uint32_t temp_bitcost = 0; + double temp_bitcost = 0; uint32_t merge_idx; int8_t merged = 0; @@ -456,23 +453,26 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state, // Check mvd cost only if mv is not merged if (!merged) { - uint32_t mvd_cost = 0; + double mvd_cost = 0; select_mv_cand(state, mv_cand, x, y, &mvd_cost); temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; - return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); + return 
temp_bitcost * state->lambda_sqrt; } -static bool early_terminate(inter_search_info_t *info) +static bool early_terminate(inter_search_info_t *info, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, { 0, -1 }, { -1, 0 }, { 0, 0 }, }; - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; int first_index = 0; int last_index = 3; @@ -482,9 +482,9 @@ static bool early_terminate(inter_search_info_t *info) if (info->state->encoder_control->cfg.me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE) { - threshold = info->best_cost * 0.95; + threshold = *best_cost * 0.95; } else { - threshold = info->best_cost; + threshold = *best_cost; } int best_index = 6; @@ -492,7 +492,7 @@ static bool early_terminate(inter_search_info_t *info) int x = mv.x + small_hexbs[i].x; int y = mv.y + small_hexbs[i].y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -502,7 +502,7 @@ static bool early_terminate(inter_search_info_t *info) mv.y += small_hexbs[best_index].y; // If best match is not better than threshold, we stop the search. 
- if (info->best_cost >= threshold) { + if (*best_cost >= threshold) { return true; } @@ -517,7 +517,10 @@ void kvz_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, vector2d_t mv, - int *best_dist) + int *best_dist, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { assert(pattern_type < 4); @@ -619,7 +622,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, int x = mv.x + offset.x; int y = mv.y + offset.y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -632,20 +635,27 @@ void kvz_tz_pattern_search(inter_search_info_t *info, void kvz_tz_raster_search(inter_search_info_t *info, int iSearchRange, - int iRaster) + int iRaster, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { - const vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + const vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; //compute SAD values for every point in the iRaster downsampled version of the current search area for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) { for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) { - check_mv_cost(info, mv.x + x, mv.y + y); + check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv); } } } -static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) +static void tz_search(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -657,25 +667,13 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. 
These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } - - vector2d_t start = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + + vector2d_t start = { best_mv->x >> 2, best_mv->y >> 2 }; // step 2, grid search int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); // Break the loop if the last three rounds didn't produce a better MV. if (best_dist != iDist) rounds_without_improvement++; @@ -688,7 +686,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) start.y = 0; rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); if (best_dist != iDist) rounds_without_improvement++; if (rounds_without_improvement >= 3) break; @@ -698,7 +696,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //step 3, raster scan if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; - kvz_tz_raster_search(info, iSearchRange, iRaster); + kvz_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv); } //step 4 @@ -706,19 +704,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //raster refinement if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - start.x = info->best_mv.x >> INTERNAL_MV_PREC; - start.y = info->best_mv.y >> INTERNAL_MV_PREC; - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + start.x 
= best_mv->x >> INTERNAL_MV_PREC; + start.y = best_mv->y >> INTERNAL_MV_PREC; + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } //star refinement (repeat step 2 for the current starting point) while (use_star_refinement && best_dist > 0) { best_dist = 0; - start.x = info->best_mv.x >> INTERNAL_MV_PREC; - start.y = info->best_mv.y >> INTERNAL_MV_PREC; + start.x = best_mv->x >> INTERNAL_MV_PREC; + start.y = best_mv->y >> INTERNAL_MV_PREC; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } } @@ -740,7 +738,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void hexagon_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -765,27 +768,14 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } - - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> 2, best_mv->y >> 2 }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. for (int i = 1; i < 7; ++i) { - if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) { + if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -814,7 +804,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Iterate through the next 3 points. for (int i = 0; i < 3; ++i) { vector2d_t offset = large_hexbs[start + i]; - if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) { + if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) { best_index = start + i; } } @@ -826,7 +816,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Do the final step of the search with a small pattern. for (int i = 1; i < 9; ++i) { - check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); + check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv); } } @@ -846,7 +836,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. 
**/ -static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void diamond_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -864,29 +859,16 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, -1}, {1, 0}, {0, 1}, {-1, 0}, {0, 0} }; - - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } // current motion vector - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // current best index enum diapos best_index = DIA_CENTER; // initial search of the points of the diamond for (int i = 0; i < 5; ++i) { - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -916,7 +898,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // this is where we came from so it's checked already if (i == from_dir) continue; - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; better_found = 1; } @@ -938,12 +920,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 static void search_mv_full(inter_search_info_t *info, int32_t search_range, - vector2d_t extra_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Search 
around the 0-vector. for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -955,7 +940,7 @@ static void search_mv_full(inter_search_info_t *info, if (!mv_in_merge(info, extra_mv)) { for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, extra_mv.x + x, extra_mv.y + y); + check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv); } } } @@ -1002,7 +987,7 @@ static void search_mv_full(inter_search_info_t *info, } if (already_tested) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } } @@ -1015,7 +1000,10 @@ static void search_mv_full(inter_search_info_t *info, * Algoritm first searches 1/2-pel positions around integer mv and after best match is found, * refines the search by searching best 1/4-pel postion around best 1/2-pel position. 
*/ -static void search_frac(inter_search_info_t *info) +static void search_frac(inter_search_info_t *info, + double *best_cost, + double *best_bits, + vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: // 5 3 6 @@ -1028,13 +1016,14 @@ static void search_frac(inter_search_info_t *info) }; // Set mv to pixel precision - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; - unsigned best_cost = UINT32_MAX; - uint32_t best_bitcost = 0; - uint32_t bitcosts[4] = { 0 }; + double cost = MAX_DOUBLE; + double bitcost = 0; + double bitcosts[4] = { 0 }; unsigned best_index = 0; +// Keep this as unsigned until SAD / SATD functions are updated unsigned costs[4] = { 0 }; ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE]; @@ -1100,12 +1089,12 @@ static void search_frac(inter_search_info_t *info) costs[0] += info->mvd_cost_func(state, mv.x, mv.y, INTERNAL_MV_PREC, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[0]); - best_cost = costs[0]; - best_bitcost = bitcosts[0]; + cost = costs[0]; + bitcost = bitcosts[0]; //Set mv to half-pixel precision mv.x *= 2; @@ -1160,8 +1149,8 @@ static void search_frac(inter_search_info_t *info) mv.y + pattern[j]->y, mv_shift, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[j] ); @@ -1169,9 +1158,9 @@ static void search_frac(inter_search_info_t *info) } for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; + if (within_tile[j] && costs[j] < cost) { + cost = costs[j]; + bitcost = bitcosts[j]; best_index = i + j; } } @@ -1201,9 +1190,38 @@ static void search_frac(inter_search_info_t *info) mv.x *= 1 << (INTERNAL_MV_PREC - 2); mv.y *= 1 << (INTERNAL_MV_PREC - 2); - info->best_mv = mv; - info->best_cost = best_cost; - 
info->best_bitcost = best_bitcost; + *best_mv = mv; + *best_cost = cost; + *best_bits = bitcost; +} + +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx) { + assert(!(lcu && cu_a)); + int context = 0; + const cu_info_t* left_pu = NULL; + const cu_info_t* top_pu = NULL; + if(lcu) { + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + if (x) { + left_pu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + top_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); + } + } + else { + if (x > 0) { + left_pu = kvz_cu_array_at_const(cu_a, x - 1, y); + } + if (y > 0) { + top_pu = kvz_cu_array_at_const(cu_a, x, y - 1); + } + } + context += left_pu && left_pu->skipped; + context += top_pu && top_pu->skipped; + if (predmode_ctx) *predmode_ctx = (left_pu && left_pu->type == CU_INTRA) || (top_pu && top_pu->type == CU_INTRA); + return context; } /** @@ -1251,46 +1269,37 @@ static void apply_mv_scaling(int32_t current_poc, */ static void search_pu_inter_ref(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost, - double *best_LX_cost, - cu_info_t *unipred_LX) + lcu_t *lcu, + cu_info_t *cur_cu, + unit_stats_map_t *amvp) { const kvz_config *cfg = &info->state->encoder_control->cfg; - // which list, L0 or L1, ref_idx is in and in what index - int8_t ref_list = -1; - // the index of the ref_idx in L0 or L1 list - int8_t LX_idx; - // max value of LX_idx plus one - const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + // Reference picture might be in both lists + bool ref_list_active[2] = { false, false }; + // Reference picture indices in L0 and L1 lists + int8_t ref_list_idx[2] = { -1, -1 }; - for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) - { - // check if ref_idx is in L0 - if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { - 
ref_list = 0; - break; - } - - // check if ref_idx is in L1 - if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { - ref_list = 1; - break; + // Check if ref picture is present in the lists + for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) { + if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) { + ref_list_active[ref_list] = true; + ref_list_idx[ref_list] = i; + break; + } } } - // ref_idx has to be found in either L0 or L1 - assert(LX_idx < LX_IDX_MAX_PLUS_1); - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; + // Must find at least one reference picture + assert(ref_list_active[0] || ref_list_active[1]); + + // Does not matter which list is used, if in both. + int ref_list = ref_list_active[0] ? 0 : 1; + int LX_idx = ref_list_idx[ref_list]; // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = LX_idx; + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; kvz_inter_get_mv_cand(info->state, info->origin.x, @@ -1302,10 +1311,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, lcu, ref_list); - // store old values back - cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - - vector2d_t mv = { 0, 0 }; + vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. 
// When temporal motion vector candidates are added, there is probably @@ -1319,8 +1325,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (ref_cu->inter.mv_dir & 1) { mv_previous.x = ref_cu->inter.mv[0][0]; mv_previous.y = ref_cu->inter.mv[0][1]; - } - else { + } else { mv_previous.x = ref_cu->inter.mv[1][0]; mv_previous.y = ref_cu->inter.mv[1][1]; } @@ -1353,16 +1358,16 @@ static void search_pu_inter_ref(inter_search_info_t *info, info->state->frame->ref->pocs[neighbor_poc_index], info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] + [col_list] [ref_cu->inter.mv_ref[col_list]] ], &mv_previous - ); + ); } // Check if the mv is valid after scaling if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; + best_mv = mv_previous; } } @@ -1375,102 +1380,90 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = UINT32_MAX; + double best_cost = MAX_DOUBLE; + double best_bits = MAX_INT; - switch (cfg->ime_algorithm) { - case KVZ_IME_TZ: - tz_search(info, mv); - break; + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). 
+ select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - case KVZ_IME_FULL64: - case KVZ_IME_FULL32: - case KVZ_IME_FULL16: - case KVZ_IME_FULL8: - case KVZ_IME_FULL: - search_mv_full(info, search_range, mv); - break; + switch (cfg->ime_algorithm) { + case KVZ_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case KVZ_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; + case KVZ_IME_FULL64: + case KVZ_IME_FULL32: + case KVZ_IME_FULL16: + case KVZ_IME_FULL8: + case KVZ_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; - } + case KVZ_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; - if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { - search_frac(info); - - } else if (info->best_cost < UINT32_MAX) { - // Recalculate inter cost with SATD. 
- info->best_cost = kvz_image_calc_satd( - info->state->tile->frame->source, - info->ref, - info->origin.x, - info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> INTERNAL_MV_PREC), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> INTERNAL_MV_PREC), - info->width, - info->height); - info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5); - } - - mv = info->best_mv; - - int merged = 0; - int merge_idx = 0; - // Check every candidate to find a match - for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (info->merge_cand[merge_idx].dir != 3 && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == mv.x && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y && - (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][ - info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx) - { - merged = 1; - break; + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; } } - // Only check when candidates are different - int cu_mv_cand = 0; - if (!merged) { - cu_mv_cand = - select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. 
+ best_cost = kvz_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (best_mv.x >> INTERNAL_MV_PREC), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> INTERNAL_MV_PREC), + info->width, + info->height); + best_cost += best_bits * info->state->lambda_sqrt; } - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list+1; + double LX_cost[2] = { best_cost, best_cost }; + double LX_bits[2] = { best_bits, best_bits }; + + // Compute costs and add entries for both lists, if necessary + for (; ref_list < 2 && ref_list_active[ref_list]; ++ref_list) { + + LX_idx = ref_list_idx[ref_list]; uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt; + LX_bits[ref_list] += extra_bits; - cur_cu->merged = merged; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (mv_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (mv_t)mv.y; + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { - CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); - - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; - } - - - // Update best unipreds for biprediction - if (info->best_cost < best_LX_cost[ref_list]) { - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { // Map reference index to L0/L1 pictures - unipred_LX[ref_list].inter.mv_dir = ref_list + 1; - unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; - unipred_LX[ref_list].inter.mv[ref_list][0] = (mv_t)mv.x; - 
unipred_LX[ref_list].inter.mv[ref_list][1] = (mv_t)mv.y; + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *unipred_pu = &cur_map->unit[entry]; + *unipred_pu = *cur_cu; + unipred_pu->type = CU_INTER; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (mv_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (mv_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); - CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); - - best_LX_cost[ref_list] = info->best_cost; + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; + cur_map->keys[entry] = entry; + cur_map->size++; } } } @@ -1481,9 +1474,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, */ static void search_pu_inter_bipred(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; @@ -1515,11 +1507,26 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - mv_t mv[2][2]; + cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; + *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + bipred_pu->inter.mv_dir = 3; + + bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0]; + bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = bipred_pu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + } // Don't try merge candidates 
that don't satisfy mv constraints. if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || @@ -1541,10 +1548,10 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &frame->source->y[x + y * frame->source->stride]; - uint32_t cost = + double cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->stride); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; cost += info->mvd_cost_func(info->state, merge_cand[i].mv[0][0], @@ -1566,51 +1573,25 @@ static void search_pu_inter_bipred(inter_search_info_t *info, merge_cand[j].ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info->state->lambda_sqrt * extra_bits + 0.5; + cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].dir != 3) continue; - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info->state, x, y, 
width, height, info->mv_cand, cur_cu, lcu, reflist); - int cu_mv_cand = select_mv_cand( - info->state, - info->mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], - NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } + + bipred_pu->type = CU_INTER; + + amvp_bipred->cost[amvp_bipred->size] = cost; + amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size; + amvp_bipred->size++; } } @@ -1624,14 +1605,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info, * * \return Does an identical candidate exist in list */ -static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, - inter_merge_cand_t * cand_to_add, - int8_t * added_idx_list, - int list_size) +static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, + inter_merge_cand_t *cand_to_add, + unit_stats_map_t *merge) { bool found = false; - for (int i = 0; i < list_size && !found; ++i) { - inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]]; + for (int i = 0; i < merge->size && !found; ++i) { + int key = merge->keys[i]; + inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx]; found = cand_to_add->dir == list_cand->dir && cand_to_add->ref[0] == list_cand->ref[0] && @@ -1646,7 +1627,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, } /** - * \brief Update PU to have best modes at this depth. + * \brief Collect PU parameters and costs at this depth. 
* * \param state encoder state * \param x_cu x-coordinate of the containing CU @@ -1656,28 +1637,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, * \param i_pu index of the PU in the CU * \param lcu containing LCU * - * \param inter_cost Return inter cost of the best mode - * \param inter_bitcost Return inter bitcost of the best mode + * \param amvp Return searched AMVP PUs sorted by costs + * \param merge Return searched Merge PUs sorted by costs */ static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost) + int x_cu, int y_cu, + int depth, + part_mode_t part_mode, + int i_pu, + lcu_t *lcu, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + inter_search_info_t *info) { - *inter_cost = MAX_INT; - *inter_bitcost = MAX_INT; - const kvz_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = LCU_WIDTH >> depth; + const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); + const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); + const int width = PU_GET_W(part_mode, width_cu, i_pu); + const int height = PU_GET_H(part_mode, width_cu, i_pu); // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and // nRx2N partitions. @@ -1686,129 +1665,160 @@ static void search_pu_inter(encoder_state_t * const state, // 2NxnD partitions. 
const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_NOTSET; + cur_pu->part_size = part_mode; + cur_pu->depth = depth; + cur_pu->qp = state->qp; - inter_search_info_t info = { - .state = state, - .pic = frame->source, - .origin = { x, y }, - .width = width, - .height = height, - .mvd_cost_func = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost, - .optimized_sad = kvz_get_optimized_sad(width), - }; + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); + + FILL(*info, 0); + + info->state = state; + info->pic = frame->source; + info->origin.x = x; + info->origin.y = y; + info->width = width; + info->height = height; + info->mvd_cost_func = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost; + info->optimized_sad = kvz_get_optimized_sad(width); // Search for merge mode candidates - info.num_merge_cand = kvz_inter_get_merge_cand( + info->num_merge_cand = kvz_inter_get_merge_cand( state, x, y, width, height, merge_a1, merge_b1, - info.merge_cand, + info->merge_cand, lcu ); - // Default to candidate 0 - CU_SET_MV_CAND(cur_cu, 0, 0); - CU_SET_MV_CAND(cur_cu, 1, 0); - // Merge Analysis starts here - int8_t mrg_cands[MRG_MAX_NUM_CANDS]; - double mrg_costs[MRG_MAX_NUM_CANDS]; + merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - mrg_cands[i] = -1; - mrg_costs[i] = MAX_DOUBLE; + merge->keys[i] = -1; + merge->cost[i] = MAX_DOUBLE; } - int num_rdo_cands = 0; - + const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default + 
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0); +#else + const double no_skip_flag = 0; +#endif // Check motion vector constraints and perform rough search - for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; + for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; + inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx]; + cur_pu->inter.mv_dir = cur_cand->dir; + cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; + cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; + cur_pu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_pu->inter.mv[0][1] = cur_cand->mv[0][1]; + cur_pu->inter.mv[1][0] = cur_cand->mv[1][0]; + cur_pu->inter.mv[1][1] = cur_cand->mv[1][1]; // If bipred is not enabled, do not try candidates with mv_dir == 3. // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. - if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; + if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - mrg_cands, - num_rdo_cands); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list - if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_pu->inter.mv_dir & 1; + bool active_L1 = cur_pu->inter.mv_dir & 2; + if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) || + (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) || is_duplicate) { continue; } kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - mrg_costs[num_rdo_cands] = kvz_satd_any_size(width, height, - lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, - lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); - - // Add cost of coding the merge index - mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt; + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; - mrg_cands[num_rdo_cands] = merge_idx; - num_rdo_cands++; + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + } + else { + merge->cost[merge->size] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + bits += no_skip_flag; + merge->cost[merge->size] += bits * info->state->lambda_sqrt; + } + // Add cost of coding the merge index + merge->bits[merge->size] = bits; + merge->keys[merge->size] = merge->size; + + + merge->size++; } - // Sort candidates by cost - kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + assert(merge->size <= 
MAX_UNIT_STATS_MAP_SIZE); + kvz_sort_keys_by_cost(merge); - // Limit by availability - // TODO: Do not limit to just 1 - num_rdo_cands = MIN(1, num_rdo_cands); + // Try early skip decision on just one merge candidate if available + int num_rdo_cands = MIN(1, merge->size); // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { - for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { - - // Reconstruct blocks with merge candidate. - // Check luma CBF. Then, check chroma CBFs if luma CBF is not set - // and chroma exists. - // Early terminate if merge candidate with zero CBF is found. - int merge_idx = mrg_cands[merge_rdo_idx]; - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; - - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; - - kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); - kvz_inter_recon_cu(state, lcu, x, y, width, true, false); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true); - - if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { - continue; + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { + if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { + merge->size = 1; + merge->bits[0] = merge->bits[merge->keys[merge_key]]; + merge->cost[0] = merge->cost[merge->keys[merge_key]]; + merge->unit[0] = merge->unit[merge->keys[merge_key]]; + merge->keys[0] = 0; } - else if (has_chroma) { - kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true); - if 
(!cbf_is_set_any(cur_cu->cbf, depth)) { - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->skipped = true; - *inter_cost = 0.0; // TODO: Check this - *inter_bitcost = merge_idx; // TODO: Check this - return; + else if(cfg->rdo < 2) { + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. + int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; + kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + kvz_inter_recon_cu(state, lcu, x, y, width, true, false); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true); + + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + continue; + } + else if (has_chroma) { + kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = true; + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; + return; + } } } } @@ -1816,16 +1826,139 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search starts here - // Store unipred information of L0 and L1 for biprediction - // Best cost will be left at MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - 
cu_info_t unipreds[2]; + amvp[0].size = 0; + amvp[1].size = 0; + amvp[2].size = 0; + + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + for (int i = 0; i < state->frame->ref->used_size; ++i) { + amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; + } + } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { - info.ref_idx = ref_idx; - info.ref = state->frame->ref->images[ref_idx]; + info->ref_idx = ref_idx; + info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds); + search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + } + + assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); + assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE); + kvz_sort_keys_by_cost(&amvp[0]); + kvz_sort_keys_by_cost(&amvp[1]); + + int best_keys[2] = { + amvp[0].size > 0 ? amvp[0].keys[0] : 0, + amvp[1].size > 0 ? amvp[1].keys[0] : 0 + }; + + cu_info_t *best_unipred[2] = { + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] + }; + + // Prevent using the same ref picture with both lists. + // TODO: allow searching two MVs from the same reference picture. + if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { + + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int L0_idx = best_unipred[0]->inter.mv_ref[0]; + int L1_idx = best_unipred[1]->inter.mv_ref[1]; + + int L0_ref_idx = ref_LX[0][L0_idx]; + int L1_ref_idx = ref_LX[1][L1_idx]; + + if (L0_ref_idx == L1_ref_idx) { + // Invalidate the other based the list that has the 2nd best PU + double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE; + double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE; + int list = (L0_2nd_cost <= L1_2nd_cost) ? 
1 : 0; + amvp[list].cost[best_keys[list]] = MAX_DOUBLE; + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size--; + best_keys[list] = amvp[list].keys[0]; + best_unipred[list] = &amvp[list].unit[best_keys[list]]; + } + } + + // Fractional-pixel motion estimation. + // Refine the best PUs so far from both lists, if available. + for (int list = 0; list < 2; ++list) { + + // TODO: make configurable + int n_best = MIN(1, amvp[list].size); + if (cfg->fme_level > 0) { + + for (int i = 0; i < n_best; ++i) { + + int key = amvp[list].keys[i]; + cu_info_t *unipred_pu = &amvp[list].unit[key]; + + // Find the reference picture + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int LX_idx = unipred_pu->inter.mv_ref[list]; + info->ref_idx = ref_LX[list][LX_idx]; + info->ref = ref->images[info->ref_idx]; + + kvz_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + unipred_pu, + lcu, + list); + + double frac_cost = MAX_DOUBLE; + double frac_bits = MAX_INT; + vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; + + search_frac(info, &frac_cost, &frac_bits, &frac_mv); + + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); + const int extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing + frac_cost += extra_bits * info->state->lambda_sqrt; + frac_bits += extra_bits; + + bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); + if (valid_mv) { + + unipred_pu->inter.mv[list][0] = frac_mv.x; + unipred_pu->inter.mv[list][1] = frac_mv.y; + CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + } + + amvp[list].cost[key] = frac_cost; + amvp[list].bits[key] = frac_bits; + 
} + } + + // Invalidate PUs with SAD-based costs. (FME not performed). + // TODO: Recalculate SAD costs with SATD for further processing. + for (int i = n_best; i < amvp[list].size; ++i) { + int key = amvp[list].keys[i]; + amvp[list].cost[key] = MAX_DOUBLE; + } + } + + // Costs are now, SATD-based. Omit PUs with SAD-based costs. + // TODO: Recalculate SAD costs with SATD for further processing. + kvz_sort_keys_by_cost(&amvp[list]); + amvp[list].size = n_best; + } + + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { + if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); } // Search bi-pred positions @@ -1835,25 +1968,39 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { + cu_info_t *bipred_pu = &amvp[2].unit[0]; + *bipred_pu = *cur_pu; + double best_bipred_cost = MAX_DOUBLE; + // Try biprediction from valid acquired unipreds. - if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) { + if (amvp[0].size > 0 && amvp[1].size > 0) { // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. 
- const image_list_t *const ref = info.state->frame->ref; - uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; - inter_merge_cand_t *merge_cand = info.merge_cand; + bipred_pu->inter.mv_dir = 3; - mv_t mv[2][2]; - mv[0][0] = unipreds[0].inter.mv[0][0]; - mv[0][1] = unipreds[0].inter.mv[0][1]; - mv[1][0] = unipreds[1].inter.mv[1][0]; - mv[1][1] = unipreds[1].inter.mv[1][1]; + bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - kvz_inter_recon_bipred(info.state, - ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]], - ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]], + int16_t (*mv)[2] = bipred_pu->inter.mv; + mv[0][0] = best_unipred[0]->inter.mv[0][0]; + mv[0][1] = best_unipred[0]->inter.mv[0][1]; + mv[1][0] = best_unipred[1]->inter.mv[1][0]; + mv[1][1] = best_unipred[1]->inter.mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + } + + kvz_inter_recon_bipred(info->state, + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, width, height, @@ -1864,104 +2011,79 @@ static void search_pu_inter(encoder_state_t * const state, const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - uint32_t cost = + + best_bipred_cost = kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; - cost += info.mvd_cost_func(info.state, - unipreds[0].inter.mv[0][0], - unipreds[0].inter.mv[0][1], + best_bipred_cost += info->mvd_cost_func(info->state, + bipred_pu->inter.mv[0][0], + bipred_pu->inter.mv[0][1], 0, - 
info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[0]); - cost += info.mvd_cost_func(info.state, - unipreds[1].inter.mv[1][0], - unipreds[1].inter.mv[1][1], + best_bipred_cost += info->mvd_cost_func(info->state, + bipred_pu->inter.mv[1][0], + bipred_pu->inter.mv[1][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - unipreds[0].inter.mv_ref[0], - unipreds[1].inter.mv_ref[1] + bipred_pu->inter.mv_ref[0], + bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits + 0.5; + best_bipred_cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1]; - - cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0]; - cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1]; - cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0]; - cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].dir != 3) continue; - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } + if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); int cu_mv_cand = select_mv_cand( - info.state, - 
info.mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].cost[amvp[2].size] = best_bipred_cost; + amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].keys[amvp[2].size] = amvp[2].size; + amvp[2].size++; } } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + + assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); + kvz_sort_keys_by_cost(&amvp[2]); + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } - - // Compare best merge cost to amvp cost - if (mrg_costs[0] < *inter_cost) { - *inter_cost = mrg_costs[0]; - *inter_bitcost = 0; // TODO: Check this - int merge_idx = mrg_cands[0]; - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; - cur_cu->merged = true; - cur_cu->skipped = false; - } - - if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { - assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1])); + 
if(cfg->rdo < 2) { + int predmode_ctx; + const int skip_contest = kvz_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + const double part_mode_bits = state->encoder_control->cfg.smp_enable || state->encoder_control->cfg.amp_enable ? + CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1) + : 0; + const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); + const double total_bits = no_skip_flag + part_mode_bits + pred_mode_bits; + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += total_bits; + amvp[i].cost[best_key] += (total_bits)* state->lambda_sqrt; + } + } } } @@ -1985,32 +2107,92 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_bitcost Return inter bitcost */ void kvz_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost){ - - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int x, int y, int depth, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost){ + int tr_depth = MAX(1, depth); if (cur_cu->part_size != SIZE_2Nx2N) { tr_depth = depth + 1; } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + const int x_px = SUB_SCU(x); + const int y_px = SUB_SCU(y); + const int width = LCU_WIDTH >> depth; + cabac_data_t cabac_copy; + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); + cabac_copy.update = 1; + + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + *cur_pu = *cur_cu; + const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); - kvz_quantize_lcu_residual(state, true, reconstruct_chroma, - x, y, depth, - NULL, - lcu, 
- false); - *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + int index = y_px * LCU_WIDTH + x_px; + double ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width) * KVZ_LUMA_MULT; if (reconstruct_chroma) { - *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; + double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width / 2); + double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width / 2); + ssd += (ssd_u + ssd_v) * KVZ_CHROMA_MULT; } + double no_cbf_bits; + double bits = 0; + const int skip_context = kvz_get_skip_context(x, y, lcu, NULL, NULL); + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; + bits += kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + } + else { + no_cbf_bits = kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1); + } + double no_cbf_cost = ssd + no_cbf_bits * state->lambda; - *inter_cost += *inter_bitcost * state->lambda; + kvz_quantize_lcu_residual(state, true, reconstruct_chroma, + x, y, depth, + cur_cu, + lcu, + false); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + if(cbf) { + *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); + if (reconstruct_chroma) { + *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + } + } + else { + // If we have no coeffs after quant we already have the cost calculated + *inter_cost = no_cbf_cost; + cur_cu->cbf = 0; + *inter_bitcost = no_cbf_bits; + return; + } + + *inter_cost += (bits)* 
state->lambda; + *inter_bitcost = bits; + + if(no_cbf_cost < *inter_cost) { + cur_cu->cbf = 0; + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->skipped = 1; + } + *inter_cost = no_cbf_cost; + *inter_bitcost = no_cbf_bits; + + } } @@ -2032,22 +2214,80 @@ void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. + unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu, - inter_cost, - inter_bitcost); + amvp, + &merge, + &info); - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + // Early Skip CU decision + if (merge.size == 1 && merge.unit[0].skipped) { + *inter_cost = merge.cost[0]; + *inter_bitcost = merge.bits[0]; + return; + } + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this + } + + if (*inter_cost == MAX_DOUBLE) { + // Could not find any motion vector. 
+ *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + return; + } + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + *cur_pu = *best_inter_pu; + + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), + true, state->encoder_control->chroma_format != KVZ_CSP_400); + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } } @@ -2067,14 +2307,24 @@ void kvz_search_cu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void kvz_search_cu_smp(encoder_state_t * const state, +void kvz_search_cu_smp(encoder_state_t* const state, int x, int y, int depth, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. 
+ unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + const int num_pu = kvz_part_mode_num_parts[part_mode]; const int width = LCU_WIDTH >> depth; const int y_local = SUB_SCU(y); @@ -2088,58 +2338,94 @@ void kvz_search_cu_smp(encoder_state_t * const state, const int y_pu = PU_GET_Y(part_mode, width, y_local, i); const int width_pu = PU_GET_W(part_mode, width, i); const int height_pu = PU_GET_H(part_mode, width, i); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - cur_pu->type = CU_INTER; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->qp = state->qp; + double cost = MAX_DOUBLE; + double bitcost = MAX_INT; - double cost = MAX_INT; - uint32_t bitcost = MAX_INT; + search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info); - search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); + cu_info_t* best_inter_pu = NULL; - if (cost >= MAX_INT) { + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + cost = amvp[mv_dir - 1].cost[best_key]; + bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + cost = merge.cost[best_merge_key]; + bitcost = 0; // TODO: Check this + } + + if (cost == MAX_DOUBLE) { // Could not find any motion vector. 
- *inter_cost = MAX_INT; + *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; return; } - *inter_cost += cost; + *inter_cost += cost; *inter_bitcost += bitcost; - for (int y_idx = y_pu; y_idx < y_pu + height_pu; y_idx += SCU_WIDTH) { - for (int x_idx = x_pu; x_idx < x_pu + width_pu; x_idx += SCU_WIDTH) { - cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x_idx, y_idx); + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); + *cur_pu = *best_inter_pu; + + for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) { + for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) { + cu_info_t* scu = LCU_GET_CU_AT_PX(lcu, x, y); scu->type = CU_INTER; scu->inter = cur_pu->inter; } } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); + } + } + double smp_extra_bits = 0; + if (state->encoder_control->cfg.rdo < 2) { + //smp_extra_bits = kvz_encode_part_mode( + // state, + // &state->search_cabac, + // LCU_GET_CU_AT_PX(lcu, x_local, y_local), + // depth + //); + + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL, NULL)], 0, smp_extra_bits, "skip_flag"); + + // The transform is split for SMP and AMP blocks so we need more bits for + // coding the CBF. + smp_extra_bits += 6; + + *inter_bitcost += smp_extra_bits; } // Calculate more accurate cost when needed if (state->encoder_control->cfg.rdo >= 2) { kvz_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + x, y, depth, + LCU_GET_CU_AT_PX(lcu, x_local, y_local), + lcu, + inter_cost, + inter_bitcost); + } else { + *inter_cost += state->lambda_sqrt * smp_extra_bits; } - - // Count bits spent for coding the partition mode. 
- int smp_extra_bits = 1; // horizontal or vertical - if (state->encoder_control->cfg.amp_enable) { - smp_extra_bits += 1; // symmetric or asymmetric - if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) { - smp_extra_bits += 1; // U,L or D,R - } - } - // The transform is split for SMP and AMP blocks so we need more bits for - // coding the CBF. - smp_extra_bits += 6; - - *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits; - *inter_bitcost += smp_extra_bits; } diff --git a/src/search_inter.h b/src/search_inter.h index 5aff9f7f..cc003f92 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -64,20 +64,20 @@ enum hpel_position { HPEL_POS_DIA = 2 }; -typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state, +typedef double kvz_mvd_cost_func(const encoder_state_t *state, int x, int y, int mv_shift, mv_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost); + double *bitcost); void kvz_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); void kvz_search_cu_smp(encoder_state_t * const state, int x, int y, @@ -85,12 +85,20 @@ void kvz_search_cu_smp(encoder_state_t * const state, part_mode_t part_mode, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); unsigned kvz_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); +void kvz_cu_cost_inter_rd2(encoder_state_t* const state, + int x, int y, int depth, + cu_info_t* cur_cu, + lcu_t* lcu, + double* inter_cost, + double* inter_bitcost); + +int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); #endif // SEARCH_INTER_H_ diff --git a/src/search_intra.c b/src/search_intra.c index 87139b93..6f7a9349 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -97,13 +97,13 @@ static 
double get_cost(encoder_state_t * const state, // Add the offset bit costs of signaling 'luma and chroma use trskip', // versus signaling 'luma and chroma don't use trskip' to the SAD cost. - const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma; + const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma; double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); // ToDo: Check cost if (state->encoder_control->chroma_format != KVZ_CSP_400) { - ctx = &state->cabac.ctx.transform_skip_model_chroma; + ctx = &state->search_cabac.ctx.transform_skip_model_chroma; trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); } @@ -394,7 +394,7 @@ static double search_intra_trdepth(encoder_state_t * const state, // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). if (depth < max_depth && depth < MAX_PU_DEPTH) { - split_cost = 3 * state->lambda; + split_cost = 0; split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); if (split_cost < nosplit_cost) { @@ -417,14 +417,15 @@ static double search_intra_trdepth(encoder_state_t * const state, // so this will code cbf as 0 and not code the cbf at all for descendants. 
if (state->encoder_control->chroma_format != KVZ_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - const cabac_ctx_t* ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]); + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } } @@ -677,9 +678,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. - int lambda_cost = (int)(state->lambda_sqrt + 0.5); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0); + costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0); } #undef PARALLEL_BLKS @@ -771,7 +771,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, int rdo_bitcost = kvz_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id); *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5); - + // Mip related stuff // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream. // Half of the modes [16, 31] are indicated with the separate transpose flag. 
@@ -818,6 +818,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state, } + // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. @@ -854,7 +855,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state, double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id) { - double mode_bits = 0.0; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; + double mode_bits = 0; bool enable_mip = state->encoder_control->cfg.mip; bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false; @@ -899,11 +901,26 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const break; } } + cabac_ctx_t *ctx = &(cabac->ctx.luma_planar_model[1]); + CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search"); + if (state->search_cabac.update) { + if(mode_in_preds) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx"); + if(luma_mode != intra_preds[0]) { + CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx"); + } + } + else { + // This value should be transformed for actual coding, + // but here the value does not actually matter, just that we write 5 bits + CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode"); + } + } bool enable_mrl = state->encoder_control->cfg.mrl; uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0; - const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model); + ctx = &(cabac->ctx.intra_luma_mpm_flag_model); if (multi_ref_index == 0) { mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1); @@ -911,17 +928,17 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const // Add MRL bits. 
if (enable_mrl && MAX_REF_LINE_IDX > 1) { - ctx = &(state->cabac.ctx.multi_ref_line[0]); + ctx = &(cabac->ctx.multi_ref_line[0]); mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0); if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) { - ctx = &(state->cabac.ctx.multi_ref_line[1]); + ctx = &(cabac->ctx.multi_ref_line[1]); mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1); } } if (mode_in_preds != -1 || multi_ref_index != 0) { - ctx = &(state->cabac.ctx.luma_planar_model[0]); + ctx = &(cabac->ctx.luma_planar_model[0]); if (multi_ref_index == 0) { mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0); } @@ -938,7 +955,8 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model); double mode_bits; if (chroma_mode == luma_mode) { mode_bits = CTX_ENTROPY_FBITS(ctx, 0); @@ -958,6 +976,13 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67); } + if(cabac->update) { + if(chroma_mode != luma_mode) { + // Again it does not matter what we actually write here + CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); + } + } + return mode_bits; } @@ -1045,9 +1070,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state, -1, chroma.mode, // skip luma NULL, cclm_params, 0, false, false, lcu); } + double bits = 0; chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode); + bits += mode_bits; chroma.cost += mode_bits * state->lambda; if (chroma.cost < best_chroma.cost) { diff --git a/src/transform.c b/src/transform.c index 09de7b2c..4c90f3f4 100644 --- a/src/transform.c +++ 
b/src/transform.c @@ -260,11 +260,9 @@ int kvz_quantize_residual_trskip( struct { kvz_pixel rec[LCU_WIDTH * LCU_WIDTH]; coeff_t coeff[LCU_WIDTH * LCU_WIDTH]; - uint32_t cost; + double cost; int has_coeffs; } skip, *best; - - const int bit_cost = (int)(state->lambda + 0.5); //noskip.has_coeffs = kvz_quantize_residual( // state, cur_cu, width, color, scan_order, @@ -278,7 +276,7 @@ int kvz_quantize_residual_trskip( 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj); skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width); - skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost; + skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda; /* if (noskip.cost <= skip.cost) { *trskip_out = 0;