diff --git a/README.md b/README.md index 5027547d..0058a2c7 100644 --- a/README.md +++ b/README.md @@ -145,11 +145,20 @@ Video structure: - frametile: Constrain within the tile. - frametilemargin: Constrain even more. --roi : Use a delta QP map for region of interest. - Reads an array of delta QP values from a text - file. The file format is: width and height of - the QP delta map followed by width*height delta - QP values in raster order. The map can be of any - size and will be scaled to the video size. + Reads an array of delta QP values from a file. + Text and binary files are supported and detected + from the file extension (.txt/.bin). If a known + extension is not found, the file is treated as + a text file. The file can include one or many + ROI frames each in the following format: + width and height of the QP delta map followed + by width * height delta QP values in raster + order. In binary format, width and height are + 32-bit integers whereas the delta QP values are + signed 8-bit values. The map can be of any size + and will be scaled to the video size. The file + reading will loop if end of the file is reached. + See roi.txt in the examples folder. --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. in PPS and slice_qp_delta in slize header zero. --(no-)erp-aqp : Use adaptive QP for 360 degree video with diff --git a/doc/uvg266.1 b/doc/uvg266.1 index e3657ea5..7a4319f8 100644 --- a/doc/uvg266.1 +++ b/doc/uvg266.1 @@ -164,11 +164,20 @@ Constrain movement vectors. [none] .TP \fB\-\-roi Use a delta QP map for region of interest. -Reads an array of delta QP values from a text -file. The file format is: width and height of -the QP delta map followed by width*height delta -QP values in raster order. The map can be of any -size and will be scaled to the video size. +Reads an array of delta QP values from a file. +Text and binary files are supported and detected +from the file extension (.txt/.bin). 
If a known +extension is not found, the file is treated as +a text file. The file can include one or many +ROI frames each in the following format: +width and height of the QP delta map followed +by width * height delta QP values in raster +order. In binary format, width and height are +32\-bit integers whereas the delta QP values are +signed 8\-bit values. The map can be of any size +and will be scaled to the video size. The file +reading will loop if end of the file is reached. +See roi.txt in the examples folder. .TP \fB\-\-set\-qp\-in\-cu Set QP at CU level keeping pic_init_qp_minus26. diff --git a/src/alf.c b/src/alf.c index 7793c483..ff312627 100644 --- a/src/alf.c +++ b/src/alf.c @@ -1236,19 +1236,19 @@ static void code_alf_ctu_filter_index(encoder_state_t * const state, assert(filter_set_idx < num_available_filt_sets); //"temporal non-latest set" if (num_aps > 1) { - uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS); + uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS, NULL); } } else { assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //"fixed set larger than temporal" - uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS); + uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL); } } else { assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //Fixed set numavail < num_fixed - uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS); + uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL); } } diff --git a/src/bitstream.c b/src/bitstream.c index ce243f51..9d6de07e 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -33,6 +33,7 @@ #include "bitstream.h" #include +#include #include #include diff --git a/src/cabac.c b/src/cabac.c index 794e4de5..9f33b503 100644 --- a/src/cabac.c +++ b/src/cabac.c 
@@ -70,6 +70,7 @@ void uvg_cabac_start(cabac_data_t * const data) data->num_buffered_bytes = 0; data->buffered_byte = 0xff; data->only_count = 0; // By default, write bits out + data->update = 0; } /** @@ -199,7 +200,7 @@ void uvg_cabac_encode_bin_trm(cabac_data_t * const data, const uint8_t bin_value /** * \brief encode truncated binary code */ -void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value) { +void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value, double* bits_out) { int thresh; int symbol = bin_value; if (max_value > 256) { @@ -219,9 +220,11 @@ void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_va int b = max_value - val; if (symbol < val - b) { CABAC_BINS_EP(data, symbol, thresh, "TruncSymbols"); + if (bits_out) *bits_out += thresh; } else { symbol += val - b; CABAC_BINS_EP(data, symbol, thresh + 1, "TruncSymbols"); + if (bits_out) *bits_out += thresh + 1; } } @@ -349,26 +352,30 @@ void uvg_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem /** * \brief */ -void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol) +void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, + cabac_ctx_t * const ctx, + uint32_t symbol, + const int32_t offset, + const uint32_t max_symbol, + double* bits_out) { int8_t code_last = max_symbol > symbol; assert(symbol <= max_symbol); if (!max_symbol) return; - - data->cur_ctx = ctx; - CABAC_BIN(data, symbol, "ums"); + + CABAC_FBITS_UPDATE(data, ctx, symbol, *bits_out, "ums"); if (!symbol) return; data->cur_ctx = &ctx[offset]; while (--symbol) { - CABAC_BIN(data, 1, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums"); } if (code_last) { - CABAC_BIN(data, 0, "ums"); + CABAC_FBITS_UPDATE(data, &ctx[offset], 0,*bits_out, "ums"); } } @@ -405,7 +412,7 @@ 
void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int /** * \brief */ -void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state, +uint32_t uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count) @@ -426,4 +433,5 @@ void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state, num_bins += count; CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb"); + return num_bins; } diff --git a/src/cabac.h b/src/cabac.h index d642787f..0088d5d9 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -59,7 +59,8 @@ typedef struct uint32_t buffered_byte; int32_t num_buffered_bytes; int32_t bits_left; - int8_t only_count; + int8_t only_count : 4; + int8_t update : 4; bitstream_t *stream; // CONTEXTS @@ -133,18 +134,18 @@ extern const uint8_t uvg_g_auc_renorm_table[32]; void uvg_cabac_start(cabac_data_t *data); void uvg_cabac_encode_bin(cabac_data_t *data, uint32_t bin_value); void uvg_cabac_encode_bin_ep(cabac_data_t *data, uint32_t bin_value); -void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value); +void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value, double* bits_out); void uvg_cabac_encode_bins_ep(cabac_data_t *data, uint32_t bin_values, int num_bins); void uvg_cabac_encode_bin_trm(cabac_data_t *data, uint8_t bin_value); void uvg_cabac_write(cabac_data_t *data); void uvg_cabac_finish(cabac_data_t *data); void uvg_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param, const unsigned int cutoff); -void uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, +uint32_t uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, uint32_t symbol, uint32_t count); void uvg_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, - uint32_t symbol, int32_t offset, - uint32_t max_symbol); + uint32_t symbol, int32_t offset, + 
uint32_t max_symbol, double* bits_out); void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol); #define CTX_PROB_BITS 15 @@ -153,6 +154,18 @@ void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0)) #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1)) +// Floating point fractional bits, derived from kvz_entropy_bits +extern const float uvg_f_entropy_bits[512]; +#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] + +#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \ + if((cabac)->only_count) (bits) += uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \ + if((cabac)->update) {\ + (cabac)->cur_ctx = ctx;\ + CABAC_BIN((cabac), (val), (name));\ + } \ +} while(0) + // Macros #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] ) #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 ) @@ -185,23 +198,23 @@ extern uint32_t uvg_cabac_bins_count; extern bool uvg_cabac_bins_verbose; #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = CTX_STATE(data->cur_ctx); \ - if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %d [%d:%d] %s = %u, range = %u LPS = %u state = %u -> ", \ - uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS(data->cur_ctx,(data)->range), CTX_LPS(data->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS(data->cur_ctx,(data)->range), prev_state); }\ + if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %d [%d:%d] %s = %u, range = %u LPS = %u state = %u -> ", \ + uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS((data)->cur_ctx,(data)->range), CTX_LPS((data)->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS((data)->cur_ctx,(data)->range), prev_state); }\ uvg_cabac_encode_bin((data), (value)); \ - if(uvg_cabac_bins_verbose && 
!data->only_count) printf("%u\n", CTX_STATE(data->cur_ctx)); } + if(uvg_cabac_bins_verbose && !(data)->only_count) printf("%u\n", CTX_STATE((data)->cur_ctx)); } #define CABAC_BINS_EP(data, value, bins, name) { \ - uint32_t prev_state = CTX_STATE(data->cur_ctx); \ + uint32_t prev_state = (!(data)->only_count) ? CTX_STATE(data->cur_ctx) : 0; \ uvg_cabac_encode_bins_ep((data), (value), (bins)); \ if(uvg_cabac_bins_verbose && !data->only_count) { printf("%d %s = %u(%u bins), state = %u -> %u\n", \ - uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE(data->cur_ctx)); uvg_cabac_bins_count+=bins;}} + uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE((data)->cur_ctx)); uvg_cabac_bins_count+=(bins);}} #define CABAC_BIN_EP(data, value, name) { \ - uint32_t prev_state = CTX_STATE(data->cur_ctx); \ + uint32_t prev_state = (!(data)->only_count) ? CTX_STATE((data)->cur_ctx) : 0;; \ uvg_cabac_encode_bin_ep((data), (value)); \ - if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %s = %u, state = %u -> %u\n", \ - uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE(data->cur_ctx)); }} + if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %s = %u, state = %u -> %u\n", \ + uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE((data)->cur_ctx)); }} #else #define CABAC_BIN(data, value, name) \ uvg_cabac_encode_bin((data), (value)); diff --git a/src/cfg.c b/src/cfg.c index 8147bcdb..96a24bb1 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -147,9 +147,9 @@ int uvg_config_init(uvg_config *cfg) cfg->gop_lp_definition.t = 1; cfg->open_gop = true; - cfg->roi.width = 0; - cfg->roi.height = 0; - cfg->roi.dqps = NULL; + cfg->roi.file_path = NULL; + cfg->roi.format = UVG_ROI_TXT; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -212,6 +212,9 @@ int uvg_config_init(uvg_config *cfg) cfg->cclm = 0; + + cfg->combine_intra_cus = 1; + cfg->force_inter = 0; return 1; } @@ -219,11 +222,11 @@ 
int uvg_config_destroy(uvg_config *cfg) { if (cfg) { FREE_POINTER(cfg->cqmfile); + FREE_POINTER(cfg->roi.file_path); FREE_POINTER(cfg->fast_coeff_table_fn); FREE_POINTER(cfg->tiles_width_split); FREE_POINTER(cfg->tiles_height_split); FREE_POINTER(cfg->slice_addresses_in_ts); - FREE_POINTER(cfg->roi.dqps); FREE_POINTER(cfg->fastrd_learning_outdir_fn); } free(cfg); @@ -1269,60 +1272,29 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) } else if OPT("implicit-rdpcm") cfg->implicit_rdpcm = (bool)atobool(value); + else if OPT("roi") { - // The ROI description is as follows: - // First number is width, second number is height, - // then follows width * height number of dqp values. - FILE* f = fopen(value, "rb"); - if (!f) { - fprintf(stderr, "Could not open ROI file.\n"); + static enum uvg_roi_format const formats[] = { UVG_ROI_TXT, UVG_ROI_BIN }; + static const char * const format_names[] = { "txt", "bin", NULL }; + + char *roi_file = strdup(value); + if (!roi_file) { + fprintf(stderr, "Failed to allocate memory for ROI file name.\n"); return 0; } + FREE_POINTER(cfg->roi.file_path); + cfg->roi.file_path = roi_file; - int width = 0; - int height = 0; - if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) { - fprintf(stderr, "Failed to read ROI size.\n"); - fclose(f); - return 0; + // Get file extension or the substring after the last dot + char *maybe_extension = strrchr(cfg->roi.file_path, '.'); + if (!maybe_extension) { + cfg->roi.format = UVG_ROI_TXT; + } else { + maybe_extension++; + int8_t format; + bool unknown_format = !parse_enum(maybe_extension, format_names, &format); + cfg->roi.format = unknown_format ? 
UVG_ROI_TXT : formats[format]; } - - if (width <= 0 || height <= 0) { - fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height); - fclose(f); - return 0; - } - - if (width > 10000 || height > 10000) { - fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); - fclose(f); - return 0; - } - - const unsigned size = width * height; - int8_t *dqp_array = calloc((size_t)size, sizeof(cfg->roi.dqps[0])); - if (!dqp_array) { - fprintf(stderr, "Failed to allocate memory for ROI table.\n"); - fclose(f); - return 0; - } - - FREE_POINTER(cfg->roi.dqps); - cfg->roi.dqps = dqp_array; - cfg->roi.width = width; - cfg->roi.height = height; - - for (int i = 0; i < size; ++i) { - int number; // Need a pointer to int for fscanf - if (fscanf(f, "%d", &number) != 1) { - fprintf(stderr, "Reading ROI file failed.\n"); - fclose(f); - return 0; - } - dqp_array[i] = CLIP(-51, 51, number); - } - - fclose(f); } else if OPT("set-qp-in-cu") { cfg->set_qp_in_cu = (bool)atobool(value); @@ -1476,6 +1448,12 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("cclm") { cfg->cclm = (bool)atobool(value); } + else if OPT("combine-intra-cus") { + cfg->combine_intra_cus = atobool(value); + } + else if OPT("force-inter") { + cfg->force_inter = atobool(value); + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index b4f920bc..9fd36359 100644 --- a/src/cli.c +++ b/src/cli.c @@ -141,6 +141,7 @@ static const struct option long_options[] = { { "force-level", required_argument, NULL, 0 }, { "high-tier", no_argument, NULL, 0 }, { "me-steps", required_argument, NULL, 0 }, + { "roi-file", required_argument, NULL, 0 }, { "fast-residual-cost", required_argument, NULL, 0 }, { "set-qp-in-cu", no_argument, NULL, 0 }, { "open-gop", no_argument, NULL, 0 }, @@ -179,6 +180,10 @@ static const struct option long_options[] = { { "no-amvr", no_argument, NULL, 0 }, { "cclm", no_argument, NULL, 0 }, { "no-cclm", no_argument, NULL, 0 }, + { "combine-intra-cus", 
no_argument, NULL, 0 }, + { "no-combine-intra-cus", no_argument, NULL, 0 }, + { "force-inter", no_argument, NULL, 0 }, + { "no-force-inter", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -499,11 +504,20 @@ void print_help(void) " - frametile: Constrain within the tile.\n" " - frametilemargin: Constrain even more.\n" " --roi : Use a delta QP map for region of interest.\n" - " Reads an array of delta QP values from a text\n" - " file. The file format is: width and height of\n" - " the QP delta map followed by width*height delta\n" - " QP values in raster order. The map can be of any\n" - " size and will be scaled to the video size.\n" + " Reads an array of delta QP values from a file.\n" + " Text and binary files are supported and detected\n" + " from the file extension (.txt/.bin). If a known\n" + " extension is not found, the file is treated as\n" + " a text file. The file can include one or many\n" + " ROI frames each in the following format:\n" + " width and height of the QP delta map followed\n" + " by width * height delta QP values in raster\n" + " order. In binary format, width and height are\n" + " 32-bit integers whereas the delta QP values are\n" + " signed 8-bit values. The map can be of any size\n" + " and will be scaled to the video size. The file\n" + " reading will loop if end of the file is reached.\n" + " See roi.txt in the examples folder.\n" " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" " in PPS and slice_qp_delta in slize header zero.\n" " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" @@ -587,6 +601,16 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n" + " on lower depth even when search is not\n" + " performed on said depth. 
Should only\n" + " be disabled if cus absolutely must not\n" + " be larger than limited by the search.\n" + " [enabled]\n" + " --force-inter : Force the encoder to use inter always.\n" + " This is mostly for debugging and is not\n" + " guaranteed to produce sensible bitstream or\n" + " work at all. [disabled]\n" " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" diff --git a/src/cu.h b/src/cu.h index d1d3ae6d..496c73ac 100644 --- a/src/cu.h +++ b/src/cu.h @@ -148,7 +148,7 @@ typedef struct uint8_t merge_idx : 3; //!< \brief merge index uint8_t tr_skip : 1; //!< \brief transform skip flag uint8_t tr_idx : 3; //!< \brief transform index - uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding + uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding uint16_t cbf; @@ -183,6 +183,16 @@ typedef struct }; } cu_info_t; +typedef struct { + int16_t x; + int16_t y; + int8_t width; + int8_t height; + int8_t chroma_width; + int8_t chroma_height; +} cu_loc_t; + + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ? 
(cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) diff --git a/src/encmain.c b/src/encmain.c index 2ca3bdc3..b04edd9d 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -441,6 +441,7 @@ int main(int argc, char *argv[]) FILE *input = NULL; //!< input file (YUV) FILE *output = NULL; //!< output file (HEVC NAL stream) FILE *recout = NULL; //!< reconstructed YUV output, --debug + FILE *roifile = NULL; clock_t start_time = clock(); clock_t encoding_start_cpu_time; UVG_CLOCK_T encoding_start_real_time; @@ -587,7 +588,7 @@ int main(int argc, char *argv[]) // Give arguments via struct to the input thread input_handler_args in_args = { .available_input_slots = available_input_slots, - .filled_input_slots = filled_input_slots, + .filled_input_slots = filled_input_slots, .input = input, .api = api, @@ -828,6 +829,7 @@ done: if (input) fclose(input); if (output) fclose(output); if (recout) fclose(recout); + if (roifile) fclose(roifile); DBG_YUVIEW_CLEANUP(); CHECKPOINTS_FINALIZE(); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 65d7ab24..f63a8bef 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -352,8 +352,8 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, } static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, uint8_t joint_chroma) { - int x_local = (x >> 1) % LCU_WIDTH_C; - int y_local = (y >> 1) % LCU_WIDTH_C; + int x_local = ((x & ~7) >> 1) % LCU_WIDTH_C; + int y_local = ((y & ~7) >> 1) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); if(!joint_chroma){ @@ -367,7 +367,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep // TODO: transform skip for chroma blocks CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, 
width_c, 1, *scan_idx, NULL, false); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, NULL, false); } if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { @@ -375,7 +375,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, NULL, false); } } else { @@ -384,7 +384,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, 2, *scan_idx, NULL, false); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, NULL, false); } } @@ -444,8 +444,6 @@ static void encode_transform_unit(encoder_state_t * const state, } else { // Time to to code the chroma transform blocks. Move to the top-left // corner of the block. - x -= 4; - y -= 4; cur_pu = uvg_cu_array_at_const((const cu_array_t *)frame->cu_array, x, y); } } @@ -485,7 +483,7 @@ static void encode_transform_coeff(encoder_state_t * const state, // containing CU. const int x_cu = 8 * (x / 8); const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = uvg_cu_array_at_const(frame->cu_array, x_cu, y_cu); + const cu_info_t *cur_cu = uvg_cu_array_at_const(frame->cu_array, x, y); // NxN signifies implicit transform split at the first transform level. // There is a similar implicit split for inter, but it is only used when @@ -507,8 +505,8 @@ static void encode_transform_coeff(encoder_state_t * const state, const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); - const int cb_flag_u = cur_pu->joint_cb_cr ? 
cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); - const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int cb_flag_u = cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V); // The split_transform_flag is not signaled when: // - transform size is greater than 32 (depth == 0) @@ -580,7 +578,7 @@ static void encode_transform_coeff(encoder_state_t * const state, cabac_data_t* cabac = &state->cabac; // cu_qp_delta_abs prefix - uvg_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5); + uvg_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL); if (qp_delta_abs >= 5) { // cu_qp_delta_abs suffix @@ -593,7 +591,13 @@ static void encode_transform_coeff(encoder_state_t * const state, state->must_code_qp_delta = false; } - if((cb_flag_u || cb_flag_v ) && (depth != 4 || only_chroma) && state->encoder_control->cfg.jccr) { + if(( + ((cb_flag_u || cb_flag_v ) + && cur_cu->type == CU_INTRA) + || (cb_flag_u && cb_flag_v)) + && (depth != 4 || only_chroma) + && state->encoder_control->cfg.jccr + ) { cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } @@ -609,17 +613,19 @@ static void encode_transform_coeff(encoder_state_t * const state, * \param depth Depth from LCU. 
* \return if non-zero mvd is coded */ -static bool encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth) +int uvg_encode_inter_prediction_unit(encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x, int y, int width, int height, + int depth, lcu_t* lcu, double* bits_out) { // Mergeflag int16_t num_cand = 0; bool non_zero_mvd = false; - cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); + double bits = 0; + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag"); + num_cand = state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { @@ -627,10 +633,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + if(cabac->only_count) bits += 1; } if (symbol == 0) break; } @@ -649,12 +655,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1)); - cabac->cur_ctx = &(cabac->ctx.inter_dir[inter_dir_ctx]); - CABAC_BIN(cabac, (inter_dir == 3), "inter_pred_idc"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } if (inter_dir < 3) { - cabac->cur_ctx = &(cabac->ctx.inter_dir[5]); - CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); + 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[5]), (inter_dir == 2), bits, "inter_pred_idc"); } } @@ -673,20 +677,21 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (ref_LX_size > 1) { // parseRefFrmIdx int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame > 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0 && ref_LX_size > 2) { cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); - CABAC_BIN(cabac, (ref_frame > 1), "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), (ref_frame > 1), bits, "ref_idx_lX"); if (ref_frame > 1 && ref_LX_size > 3) { for (int idx = 3; idx < ref_LX_size; idx++) { uint8_t val = (ref_frame > idx - 1) ? 1 : 0; CABAC_BIN_EP(cabac, val, "ref_idx_lX"); + if (cabac->only_count) bits += 1; if (!val) break; + } } } @@ -696,39 +701,45 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state, if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { mv_t mv_cand[2][2]; - uvg_inter_get_mv_cand_cua( + if (lcu) { + uvg_inter_get_mv_cand( + state, + x, y, width, height, + mv_cand, cur_cu, + lcu, ref_list_idx); + } + else { + uvg_inter_get_mv_cand_cua( state, x, y, width, height, - mv_cand, cur_cu, ref_list_idx); + mv_cand, cur_cu, ref_list_idx + ); + } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; uvg_change_precision(INTERNAL_MV_PREC, uvg_g_imv_to_prec[UVG_IMV_OFF], &mvd_hor, &mvd_ver); - - uvg_encode_mvd(state, cabac, mvd_hor, mvd_ver); + uvg_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out); non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0); } // Signal which candidate MV to use - cabac->cur_ctx = 
&(cabac->ctx.mvp_idx_model); - CABAC_BIN(cabac, CU_GET_MV_CAND(cur_cu, ref_list_idx), "mvp_flag"); + CABAC_FBITS_UPDATE(cabac,&(cabac->ctx.mvp_idx_model), CU_GET_MV_CAND(cur_cu, ref_list_idx), bits, "mvp_flag"); } // for ref_list } // if !merge + if(bits_out) *bits_out += bits; return non_zero_mvd; } -static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width, const int cclm_enabled) { +static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, const int cclm_enabled) { unsigned pred_mode = 0; unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83}; - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, 0); - const cu_info_t *first_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y); - int8_t chroma_intra_dir = first_pu->intra.mode_chroma; - int8_t luma_intra_dir = first_pu->intra.mode; + int8_t chroma_intra_dir = cur_cu->intra.mode_chroma; + int8_t luma_intra_dir = cur_cu->intra.mode; bool derived_mode = chroma_intra_dir == luma_intra_dir; @@ -803,19 +814,20 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c } } -static void encode_intra_coding_unit(encoder_state_t * const state, +void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, cabac_data_t * const cabac, const cu_info_t * const cur_cu, - int x, int y, int depth, lcu_coeff_t* coeff) + int x, int y, int depth, const lcu_t* lcu, double* bits_out) { const videoframe_t * const frame = state->tile->frame; - uint8_t intra_pred_mode_actual[4]; - uint8_t *intra_pred_mode = intra_pred_mode_actual; + uint8_t intra_pred_mode_actual; + uint8_t *intra_pred_mode = &intra_pred_mode_actual; //uint8_t intra_pred_mode_chroma = cur_cu->intra.mode_chroma; - int8_t intra_preds[4][INTRA_MPM_COUNT] = {{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, 
-1, -1},{-1, -1, -1, -1, -1, -1}}; - int8_t mpm_preds[4] = {-1, -1, -1, -1}; - uint32_t flag[4]; + int8_t intra_preds[INTRA_MPM_COUNT] = {-1, -1, -1, -1, -1, -1}; + int8_t mpm_preds = -1; + uint32_t flag; + double bits = 0; /* if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) { @@ -839,8 +851,6 @@ static void encode_intra_coding_unit(encoder_state_t * const state, CABAC_BIN(cabac, 0, "bdpcm_mode"); } */ - - const int num_pred_units = uvg_part_mode_num_parts[cur_cu->part_size]; // Intra Subpartition mode uint32_t width = (LCU_WIDTH >> depth); @@ -878,15 +888,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state, if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) { const int cu_width = LCU_WIDTH >> depth; const int cu_height = cu_width; // TODO: height for non-square blocks - uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, NULL, frame->cu_array); + uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? 
NULL : frame->cu_array); // Write MIP flag - cabac->cur_ctx = &(cabac->ctx.mip_flag[ctx_id]); - CABAC_BIN(cabac, mip_flag, "mip_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag"); if (mip_flag) { // Write MIP transpose flag & mode CABAC_BIN_EP(cabac, mip_transpose, "mip_transposed"); - uvg_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes); + if (cabac->only_count) bits += 1; + uvg_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes, bits_out); + if (cabac->only_count && bits_out) *bits_out += bits; + return; } } @@ -900,172 +912,155 @@ static void encode_intra_coding_unit(encoder_state_t * const state, if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) { if (MAX_REF_LINE_IDX > 1) { - cabac->cur_ctx = &(cabac->ctx.multi_ref_line[0]); - CABAC_BIN(cabac, multi_ref_idx != 0, "multi_ref_line"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[0]), multi_ref_idx != 0, bits, "multi_ref_line"); if (MAX_REF_LINE_IDX > 2 && multi_ref_idx != 0) { - cabac->cur_ctx = &(cabac->ctx.multi_ref_line[1]); - CABAC_BIN(cabac, multi_ref_idx != 1, "multi_ref_line") + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[1]), multi_ref_idx != 1, bits, "multi_ref_line"); } } } // ToDo: update real usage, these if clauses as such don't make any sense - if (isp_mode != 0 && multi_ref_idx == 0 && !mip_flag) { + if (isp_mode != 0 && multi_ref_idx == 0) { if (isp_mode) { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]); - CABAC_BIN(cabac, 0, "intra_subPartitions"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); } else { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]); - CABAC_BIN(cabac, 1, "intra_subPartitions"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); // ToDo: complete this if-clause if (isp_mode == 3) { - cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[1]); 
- CABAC_BIN(cabac, allow_isp - 1, "intra_subPart_ver_hor"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); } } } const int cu_width = LCU_WIDTH >> depth; - // If MIP is used, skip writing normal intra modes - if (!mip_flag) { // PREDINFO CODING // If intra prediction mode is found from the predictors, // it can be signaled with two EP's. Otherwise we can send // 5 EP bins with the full predmode // ToDo: fix comments for VVC - cabac->cur_ctx = &(cabac->ctx.intra_luma_mpm_flag_model); - for (int j = 0; j < num_pred_units; ++j) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t* cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y); + const cu_info_t* cur_pu = cur_cu; // uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y); - const cu_info_t* left_pu = NULL; - const cu_info_t* above_pu = NULL; + const cu_info_t* left_pu = NULL; + const cu_info_t* above_pu = NULL; - if (pu_x > 0) { - assert(pu_x >> 2 > 0); - left_pu = uvg_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1); - } - // Don't take the above PU across the LCU boundary. - if (pu_y % LCU_WIDTH > 0 && pu_y > 0) { - assert(pu_y >> 2 > 0); - above_pu = uvg_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1); - } + if (x > 0) { + assert(x >> 2 > 0); + left_pu = lcu ? + LCU_GET_CU_AT_PX( + lcu, + SUB_SCU(x - 1), + SUB_SCU(y + cu_width - 1)) : + uvg_cu_array_at_const( + frame->cu_array, + x - 1, + y + cu_width - 1); + } + // Don't take the above PU across the LCU boundary. + if (y % LCU_WIDTH > 0 && y > 0) { + assert(y >> 2 > 0); + above_pu = lcu ? 
+ LCU_GET_CU_AT_PX( + lcu, + SUB_SCU(x + cu_width - 1), + SUB_SCU(y -1)) : + uvg_cu_array_at_const( + frame->cu_array, + x + cu_width - 1, + y - 1); + } + + uvg_intra_get_dir_luma_predictor(x, y, + intra_preds, + cur_pu, + left_pu, above_pu); + intra_pred_mode_actual = cur_pu->intra.mode; - uvg_intra_get_dir_luma_predictor(pu_x, pu_y, - intra_preds[j], - cur_pu, - left_pu, above_pu); - - - intra_pred_mode_actual[j] = cur_pu->intra.mode; - - for (int i = 0; i < INTRA_MPM_COUNT; i++) { - if (intra_preds[j][i] == intra_pred_mode[j]) { - mpm_preds[j] = (int8_t)i; - break; - } - } - // Is the mode in the MPM array or not - flag[j] = (mpm_preds[j] == -1) ? 0 : 1; - if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { - CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); - } + for (int i = 0; i < INTRA_MPM_COUNT; i++) { + if (intra_preds[i] == *intra_pred_mode) { + mpm_preds = (int8_t)i; + break; + } + } + // Is the mode in the MPM array or not + flag = (mpm_preds == -1) ? 0 : 1; + if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag"); + } + + // Signal index of the prediction mode in the prediction list, if it is there + if (flag) { + + const cu_info_t* cur_pu = cur_cu; + if (cur_pu->intra.multi_ref_idx == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]), (mpm_preds > 0 ? 1 : 0), bits, "mpm_idx_luma_planar"); } - for (int j = 0; j < num_pred_units; ++j) { - // TODO: this loop is unnecessary in VVC. Remove in future - assert(j == 0 && "In VVC this loop should be run only once."); + if (mpm_preds > 0) { + CABAC_BIN_EP(cabac, (mpm_preds > 1 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 1) { + CABAC_BIN_EP(cabac, (mpm_preds > 2 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 2) { + CABAC_BIN_EP(cabac, (mpm_preds > 3 ? 
1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + if (mpm_preds > 3) { + CABAC_BIN_EP(cabac, (mpm_preds > 4 ? 1 : 0), "mpm_idx"); + if (cabac->only_count) bits += 1; + } + } + else { + // Signal the actual prediction mode. + int32_t tmp_pred = *intra_pred_mode; - // Signal index of the prediction mode in the prediction list, if it is there - if (flag[j]) { + uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2]; + memcpy(intra_preds_temp, intra_preds, sizeof(int8_t) * 3); + memcpy(intra_preds_temp + 4, &intra_preds[3], sizeof(int8_t) * 3); + intra_preds_temp[3] = 255; + intra_preds_temp[7] = 255; - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j); - const cu_info_t* cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y); - cabac->cur_ctx = &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]); - if (cur_pu->intra.multi_ref_idx == 0) { - CABAC_BIN(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx_luma_planar"); - } - //CABAC_BIN_EP(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx"); - if (mpm_preds[j] > 0) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 1 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 1) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 2 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 2) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 3 ? 1 : 0), "mpm_idx"); - } - if (mpm_preds[j] > 3) { - CABAC_BIN_EP(cabac, (mpm_preds[j] > 4 ? 1 : 0), "mpm_idx"); - } + // Improvised merge sort + // Sort prediction list from lowest to highest. 
+ if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t); + if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t); + if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t); + + if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t); + if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t); + if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t); + + // Merge two subarrays + int32_t array1 = 0; + int32_t array2 = 4; + for (int item = 0; item < INTRA_MPM_COUNT; item++) { + if (intra_preds_temp[array1] < intra_preds_temp[array2]) { + intra_preds[item] = intra_preds_temp[array1]; + array1++; } else { - // Signal the actual prediction mode. - int32_t tmp_pred = intra_pred_mode[j]; - - uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2]; - memcpy(intra_preds_temp, intra_preds[j], sizeof(int8_t) * 3); - memcpy(intra_preds_temp + 4, &intra_preds[j][3], sizeof(int8_t) * 3); - intra_preds_temp[3] = 255; - intra_preds_temp[7] = 255; - - // Improvised merge sort - // Sort prediction list from lowest to highest. 
- if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t); - if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t); - if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t); - - if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t); - if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t); - if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t); - - // Merge two subarrays - int32_t array1 = 0; - int32_t array2 = 4; - for (int item = 0; item < INTRA_MPM_COUNT; item++) { - if (intra_preds_temp[array1] < intra_preds_temp[array2]) { - intra_preds[j][item] = intra_preds_temp[array1]; - array1++; - } - else { - intra_preds[j][item] = intra_preds_temp[array2]; - array2++; - } - } - - // Reduce the index of the signaled prediction mode according to the - // prediction list, as it has been already signaled that it's not one - // of the prediction modes. - for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) { - if (tmp_pred > intra_preds[j][i]) { - tmp_pred--; - } - } - - uvg_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT); + intra_preds[item] = intra_preds_temp[array2]; + array2++; } } - } - // Code chroma prediction mode. 
- if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); - } - - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); - - encode_mts_idx(state, cabac, cur_cu); - - if (state->encoder_control->chroma_format != UVG_CSP_400 && depth == 4 && x % 8 && y % 8) { - encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm); - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); - } + // Reduce the index of the signaled prediction mode according to the + // prediction list, as it has been already signaled that it's not one + // of the prediction modes. + for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) { + if (tmp_pred > intra_preds[i]) { + tmp_pred--; + } + } + uvg_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT, bits_out); + } + if (cabac->only_count && bits_out) *bits_out += bits; } /** @@ -1104,32 +1099,32 @@ static void encode_part_mode(encoder_state_t * const state, // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass // ------------------------------+------------------ - + double bits = 0; if (cur_cu->type == CU_INTRA) { if (depth == MAX_DEPTH) { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); } else { - CABAC_BIN(cabac, 0, "part_mode NxN"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN"); } } } else { cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode split"); + CABAC_FBITS_UPDATE(cabac, 
&(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split"); cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_2NxnD) { - CABAC_BIN(cabac, 1, "part_mode vertical"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical"); } else { - CABAC_BIN(cabac, 0, "part_mode horizontal"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal"); } if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) { @@ -1137,22 +1132,134 @@ static void encode_part_mode(encoder_state_t * const state, if (cur_cu->part_size == SIZE_2NxN || cur_cu->part_size == SIZE_Nx2N) { - CABAC_BIN(cabac, 1, "part_mode SMP"); - return; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP"); + return bits; } - CABAC_BIN(cabac, 0, "part_mode AMP"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP"); if (cur_cu->part_size == SIZE_2NxnU || cur_cu->part_size == SIZE_nLx2N) { CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } else { CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); + if(cabac->only_count) bits += 1; } } } + return bits; } **/ + +bool uvg_write_split_flag(const encoder_state_t * const state, cabac_data_t* cabac, + const cu_info_t * left_cu, const cu_info_t * above_cu, + uint8_t split_flag, + int depth, int cu_width, int x, int y, double* bits_out) +{ + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y; + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + // Implisit split flag when on border + // Exception made in VVC with flag not being implicit if the BT can be used for + // horizontal or vertical split, then this flag tells if QT or BT is used + + bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; + no_split = allow_qt = bh_split = 
bv_split = th_split = tv_split = true; + if (depth > MAX_DEPTH) allow_qt = false; + // ToDo: update this when btt is actually used + bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH + + + uint8_t implicit_split_mode = UVG_NO_SPLIT; + //bool implicit_split = border; + bool bottom_left_available = ((abs_y + cu_width - 1) < ctrl->in.height); + bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width); + + if (!bottom_left_available && !top_right_available && allow_qt) { + implicit_split_mode = UVG_QUAD_SPLIT; + } + else if (!bottom_left_available && allow_btt) { + implicit_split_mode = UVG_HORZ_SPLIT; + } + else if (!top_right_available && allow_btt) { + implicit_split_mode = UVG_VERT_SPLIT; + } + else if (!bottom_left_available || !top_right_available) { + implicit_split_mode = UVG_QUAD_SPLIT; + } + + // Check split conditions + if (implicit_split_mode != UVG_NO_SPLIT) { + no_split = th_split = tv_split = false; + bh_split = (implicit_split_mode == UVG_HORZ_SPLIT); + bv_split = (implicit_split_mode == UVG_VERT_SPLIT); + } + + if (!allow_btt) { + bh_split = bv_split = th_split = tv_split = false; + } + + bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; + + split_flag |= implicit_split_mode != UVG_NO_SPLIT; + + int split_model = 0; + if (no_split && allow_split) { + // Get left and top block split_flags and if they are present and true, increase model number + // ToDo: should use height and width to increase model, PU_GET_W() ? 
+ if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + split_model++; + } + + if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + split_model++; + } + + uint32_t split_num = 0; + if (allow_qt) split_num += 2; + if (bh_split) split_num++; + if (bv_split) split_num++; + if (th_split) split_num++; + if (tv_split) split_num++; + + if (split_num > 0) split_num--; + + split_model += 3 * (split_num >> 1); + + cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag"); + } + + bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT; + + if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { + split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag"); + } + + // Only signal split when it is not implicit, currently only Qt split supported + if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { + + split_model = 0; + + // Get left and top block split_flags and if they are present and true, increase model number + if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { + split_model++; + } + + if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { + split_model++; + } + + split_model += (depth > 2 ? 
0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode"); + } + if (bits_out) *bits_out += bits; + return split_flag; +} + void uvg_encode_coding_tree(encoder_state_t * const state, uint16_t x, uint16_t y, @@ -1176,8 +1283,6 @@ void uvg_encode_coding_tree(encoder_state_t * const state, above_cu = uvg_cu_array_at_const((const cu_array_t*)frame->cu_array, x, y - 1); } - uint8_t split_flag = GET_SPLITDATA(cur_cu, depth); - uint8_t split_model = 0; // Absolute coordinates uint16_t abs_x = x + state->tile->offset_x; @@ -1190,123 +1295,15 @@ void uvg_encode_coding_tree(encoder_state_t * const state, bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ - if (depth <= ctrl->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { - // Implisit split flag when on border - // Exception made in VVC with flag not being implicit if the BT can be used for - // horizontal or vertical split, then this flag tells if QT or BT is used - - bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; - no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - if(depth > MAX_DEPTH) allow_qt = false; - // ToDo: update this when btt is actually used - bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH - + const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, NULL); - - uint8_t implicit_split_mode = UVG_NO_SPLIT; - //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_width - 1) < ctrl->in.height); - bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width); - - /* - if((depth >= 1 && (border_x != border_y))) implicit_split = false; - if 
(state->frame->slicetype != UVG_SLICE_I) { - if (border_x != border_y) implicit_split = false; - if (!bottom_left_available && top_right_available) implicit_split = false; - if (!top_right_available && bottom_left_available) implicit_split = false; - } - */ - - - if (!bottom_left_available && !top_right_available && allow_qt) { - implicit_split_mode = UVG_QUAD_SPLIT; - } else if (!bottom_left_available && allow_btt) { - implicit_split_mode = UVG_HORZ_SPLIT; - } else if (!top_right_available && allow_btt) { - implicit_split_mode = UVG_VERT_SPLIT; - } else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = UVG_QUAD_SPLIT; - } - - //split_flag = implicit_split_mode != UVG_NO_SPLIT; - - // Check split conditions - if (implicit_split_mode != UVG_NO_SPLIT) { - no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == UVG_HORZ_SPLIT); - bv_split = (implicit_split_mode == UVG_VERT_SPLIT); - } - - if (!allow_btt) { - bh_split = bv_split = th_split = tv_split = false; - } - - bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; - - split_flag |= implicit_split_mode != UVG_NO_SPLIT; - - if (no_split && allow_split) { - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - // ToDo: should use height and width to increase model, PU_GET_W() ? 
- if (left_cu && PU_GET_H(left_cu->part_size,LCU_WIDTH>>left_cu->depth,0) < LCU_WIDTH>>depth) { - split_model++; - } - - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { - split_model++; - } - - uint32_t split_num = 0; - if (allow_qt) split_num+=2; - if (bh_split) split_num++; - if (bv_split) split_num++; - if (th_split) split_num++; - if (tv_split) split_num++; - - if (split_num > 0) split_num--; - - split_model += 3 * (split_num >> 1); - - cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_BIN(cabac, split_flag, "SplitFlag"); - //fprintf(stdout, "split_model=%d %d / %d / %d / %d / %d\n", split_model, allow_qt, bh_split, bv_split, th_split, tv_split); - } - - bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT; - - if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { - split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); - cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]); - CABAC_BIN(cabac, qt_split, "QT_SplitFlag"); - } - - // Only signal split when it is not implicit, currently only Qt split supported - if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { - - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { - split_model++; - } - - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { - split_model++; - } - split_model += (depth > 2 ? 
0 : 3); - - cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]); - CABAC_BIN(cabac, split_flag, "split_cu_mode"); - } - if (split_flag || border) { // Split blocks and remember to change x and y block positions uvg_encode_coding_tree(state, x, y, depth + 1, coeff); @@ -1455,7 +1452,7 @@ void uvg_encode_coding_tree(encoder_state_t * const state, const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); const cu_info_t *cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y); - non_zero_mvd |= encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth); + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); } @@ -1493,7 +1490,22 @@ void uvg_encode_coding_tree(encoder_state_t * const state, } } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, coeff); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); + + // Code chroma prediction mode. 
+ if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); + } + + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff); + + encode_mts_idx(state, cabac, cur_cu); + + // For 4x4 the chroma PU/TU is coded after the last + if (state->encoder_control->chroma_format != UVG_CSP_400 && depth == 4 && x % 8 && y % 8) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); + encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff); + } } else { @@ -1510,11 +1522,111 @@ end: } +double uvg_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu, cu_info_t* cur_cu) { + double bits = 0; + const encoder_control_t* const ctrl = state->encoder_control; + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + + const int cu_width = LCU_WIDTH >> depth; + + const cu_info_t* left_cu = NULL, *above_cu = NULL; + if (x) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); + } + + if (depth <= state->frame->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + + // When not in MAX_DEPTH, insert split flag and split the blocks if needed + if (depth != MAX_DEPTH) { + uvg_write_split_flag(state, cabac, left_cu, above_cu, 0, depth, cu_width, x, y, &bits); + } + + // Encode skip flag + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + int8_t ctx_skip = 0; + + if (left_cu && left_cu->skipped) { + ctx_skip++; + } + if (above_cu && above_cu->skipped) { + ctx_skip++; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag"); + + if (cur_cu->skipped) { + int16_t num_cand = state->encoder_control->cfg.max_merge; + if (num_cand > 1) { + for (int ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + 
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); + } + else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + if(cabac->only_count) bits += 1; + } + if (symbol == 0) { + break; + } + } + } + return bits; + } + } + // Prediction mode + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + + int8_t ctx_predmode = 0; + + if ((left_cu && left_cu->type == CU_INTRA) || (above_cu && above_cu->type == CU_INTRA)) { + ctx_predmode = 1; + } + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode"); + } + + if (cur_cu->type == CU_INTER) { + const uint8_t imv_mode = UVG_IMV_OFF; + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + if (ctrl->cfg.amvr && non_zero_mvd) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); + if (imv_mode > UVG_IMV_OFF) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[4]), imv_mode, bits, "imv_flag"); + if (imv_mode < UVG_IMV_HPEL) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[1]), imv_mode, bits, "imv_flag"); // 1 indicates 4PEL, 0 FPEL + } + } + } + } + else if (cur_cu->type == CU_INTRA) { + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm); + } + } + else { + assert(0 && "Unset cu type"); + } + return bits; +} + void uvg_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver) + int32_t mvd_ver, double* bits_out) { const int8_t hor_abs_gr0 = mvd_hor != 0; const int8_t ver_abs_gr0 = mvd_ver != 0; @@ -1522,29 +1634,33 @@ void uvg_encode_mvd(encoder_state_t * const state, const uint32_t mvd_ver_abs = abs(mvd_ver); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; - 
CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver"); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor"); } if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver"); } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + if(cabac->only_count) *bits_out += bits; } uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1; CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + if (cabac->only_count) *bits_out += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + if (cabac->only_count) *bits_out += bits; } uint32_t mvd_ver_sign = mvd_ver > 0 ? 
0 : 1; CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + if (cabac->only_count) *bits_out += 1; } } diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 8141d19b..92e46e04 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -56,7 +56,33 @@ void uvg_encode_ts_residual(encoder_state_t* const state, void uvg_encode_mvd(encoder_state_t * const state, cabac_data_t *cabac, int32_t mvd_hor, - int32_t mvd_ver); + int32_t mvd_ver, + double* bits_out); + +double uvg_mock_encode_coding_unit( + encoder_state_t* const state, + cabac_data_t* cabac, + int x, int y, int depth, + lcu_t* lcu, cu_info_t* cur_cu); + +int uvg_encode_inter_prediction_unit(encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int width, int height, + int depth, + lcu_t* lcu, + double* bits_out); + +void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + int x, int y, int depth, const lcu_t* lcu, double* bits_out); + + +bool uvg_write_split_flag(const encoder_state_t* const state, cabac_data_t* cabac, + const cu_info_t* left_cu, const cu_info_t* above_cu, + uint8_t split_flag, + int depth, int cu_width, int x, int y, double* bits_out); void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, diff --git a/src/encoder.c b/src/encoder.c index daaa717e..86259ad9 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -32,7 +32,6 @@ #include "encoder.h" -// This define is required for M_PI on Windows. #define _USE_MATH_DEFINES #include #include @@ -45,14 +44,6 @@ #include "uvg_math.h" #include "fast_coeff_cost.h" -/** - * \brief Strength of QP adjustments when using adaptive QP for 360 video. - * - * Determined empirically. 
- */ -static const double ERP_AQP_STRENGTH = 3.0; - - static int encoder_control_init_gop_layer_weights(encoder_control_t * const); static unsigned cfg_num_threads(void) @@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder) } -/** - * \brief Return weight for 360 degree ERP video - * - * Returns the scaling factor of area from equirectangular projection to - * spherical surface. - * - * \param y y-coordinate of the pixel - * \param h height of the picture - */ -static double ws_weight(int y, int h) -{ - return cos((y - 0.5 * h + 0.5) * (M_PI / h)); -} - - - /** * \brief Update ROI QPs for 360 video with equirectangular projection. * @@ -162,55 +137,6 @@ static double ws_weight(int y, int h) * \param orig_width width of orig_roi * \param orig_height height of orig_roi */ -static void init_erp_aqp_roi(encoder_control_t* encoder, - int8_t *orig_roi, - int32_t orig_width, - int32_t orig_height) -{ - // Update ROI with WS-PSNR delta QPs. - int height = encoder->in.height_in_lcu; - int width = orig_roi ? orig_width : 1; - - int frame_height = encoder->in.real_height; - - encoder->cfg.roi.width = width; - encoder->cfg.roi.height = height; - encoder->cfg.roi.dqps = calloc(width * height, sizeof(orig_roi[0])); - - double total_weight = 0.0; - for (int y = 0; y < frame_height; y++) { - total_weight += ws_weight(y, frame_height); - } - - for (int y_lcu = 0; y_lcu < height; y_lcu++) { - int y_orig = LCU_WIDTH * y_lcu; - int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); - - double lcu_weight = 0.0; - for (int y = y_orig; y < y_orig + lcu_height; y++) { - lcu_weight += ws_weight(y, frame_height); - } - // Normalize. - lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); - - int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); - - if (orig_roi) { - // If a ROI array already exists, we copy the existing values to the - // new array while adding qp_delta to each. 
- int y_roi = y_lcu * orig_height / height; - for (int x = 0; x < width; x++) { - encoder->cfg.roi.dqps[x + y_lcu * width] = - CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta); - } - - } else { - // Otherwise, simply write qp_delta to the ROI array. - encoder->cfg.roi.dqps[y_lcu] = qp_delta; - } - } -} - static int8_t* derive_chroma_QP_mapping_table(const uvg_config* const cfg, int i) { @@ -394,6 +320,16 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) encoder->scaling_list.use_default_list = 1; } + // ROI / delta QP + if (cfg->roi.file_path) { + const char *mode[2] = { "r", "rb" }; + encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]); + if (!encoder->roi_file) { + fprintf(stderr, "Could not open ROI file.\n"); + goto init_failed; + } + } + if (cfg->fast_coeff_table_fn) { FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb"); if (fast_coeff_table_f == NULL) { @@ -435,32 +371,10 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) goto init_failed; } - if (cfg->erp_aqp) { - init_erp_aqp_roi(encoder, - cfg->roi.dqps, - cfg->roi.width, - cfg->roi.height); - - } else if (cfg->roi.dqps) { - // Copy delta QP array for ROI coding. - const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height; - encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0])); - memcpy(encoder->cfg.roi.dqps, - cfg->roi.dqps, - roi_size * sizeof(*cfg->roi.dqps)); - - } - // NOTE: When tr_depth_inter is equal to 0, the transform is still split // for SMP and AMP partition units. 
encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { - encoder->max_qp_delta_depth = 0; - } else { - encoder->max_qp_delta_depth = -1; - } - //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; @@ -761,7 +675,7 @@ void uvg_encoder_control_free(encoder_control_t *const encoder) FREE_POINTER(encoder->tiles_tile_id); - FREE_POINTER(encoder->cfg.roi.dqps); + FREE_POINTER(encoder->cfg.roi.file_path); uvg_scalinglist_destroy(&encoder->scaling_list); @@ -773,6 +687,10 @@ void uvg_encoder_control_free(encoder_control_t *const encoder) uvg_close_rdcost_outfiles(); + if (encoder->roi_file) { + fclose(encoder->roi_file); + } + free(encoder); } diff --git a/src/encoder.h b/src/encoder.h index 86bf2529..02dc26b7 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -130,7 +130,7 @@ typedef struct encoder_control_t //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - int8_t max_qp_delta_depth; + FILE *roi_file; int tr_depth_inter; diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index ae346526..402ec559 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -805,7 +805,7 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag"); WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26"); - WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag"); + WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 
1:0, 1, "pps_cu_qp_delta_enabled_flag"); WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag"); /* // If chroma_tool_offsets_present @@ -1037,8 +1037,8 @@ static void uvg_encoder_state_write_bitstream_picture_header( const int poc_lsb = state->frame->poc & ((1 << encoder->poc_lsb_bits) - 1); WRITE_U(stream, poc_lsb, encoder->poc_lsb_bits, "ph_pic_order_cnt_lsb"); - if (encoder->max_qp_delta_depth >= 0) { - WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice"); + if (state->frame->max_qp_delta_depth >= 0) { + WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice"); } // alf enable flags and aps IDs @@ -1118,8 +1118,8 @@ static void uvg_encoder_state_write_bitstream_picture_header( || state->frame->pictype == UVG_NAL_IDR_N_LP) { } else { - if (encoder->max_qp_delta_depth >= 0) { - WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice"); + if (state->frame->max_qp_delta_depth >= 0) { + WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice"); } if (state->encoder_control->cfg.tmvp_enable) { WRITE_U(stream, state->encoder_control->cfg.tmvp_enable, 1, "ph_pic_temporal_mvp_enabled_flag"); @@ -1128,7 +1128,7 @@ static void uvg_encoder_state_write_bitstream_picture_header( } if (encoder->cfg.jccr) { - WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag"); + WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER diff --git a/src/encoderstate.c b/src/encoderstate.c index 32d86d65..5a99e588 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -32,6 +32,9 @@ #include "encoderstate.h" + // This define is required for M_PI on Windows. +#define _USE_MATH_DEFINES +#include #include #include #include @@ -53,6 +56,12 @@ #include "strategies/strategies-picture.h" +/** + * \brief Strength of QP adjustments when using adaptive QP for 360 video. + * + * Determined empirically. 
+ */ +static const double ERP_AQP_STRENGTH = 3.0; int uvg_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -572,7 +581,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - if (depth <= state->encoder_control->max_qp_delta_depth) { + if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } @@ -624,6 +633,38 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las } } + +static void set_joint_cb_cr_modes(encoder_state_t* state, uvg_picture* pic) +{ + bool sgnFlag = true; + + if (state->encoder_control->chroma_format != UVG_CSP_400) + { + const int x1 = pic->width / 2 - 1; + const int y1 = pic->height / 2 - 1; + const int cbs = pic->stride / 2; + const int crs = pic->stride / 2; + const uvg_pixel* p_cb = pic->u + 1 * cbs; + const uvg_pixel* p_cr = pic->v + 1 * crs; + int64_t sum_cb_cr = 0; + + // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes + for (int y = 1; y < y1; y++, p_cb += cbs, p_cr += crs) + { + for (int x = 1; x < x1; x++) + { + int cb = (12 * (int)p_cb[x] - 2 * ((int)p_cb[x - 1] + (int)p_cb[x + 1] + (int)p_cb[x - cbs] + (int)p_cb[x + cbs]) - ((int)p_cb[x - 1 - cbs] + (int)p_cb[x + 1 - cbs] + (int)p_cb[x - 1 + cbs] + (int)p_cb[x + 1 + cbs])); + int cr = (12 * (int)p_cr[x] - 2 * ((int)p_cr[x - 1] + (int)p_cr[x + 1] + (int)p_cr[x - crs] + (int)p_cr[x + crs]) - ((int)p_cr[x - 1 - crs] + (int)p_cr[x + 1 - crs] + (int)p_cr[x - 1 + crs] + (int)p_cr[x + 1 + crs])); + sum_cb_cr += cb * cr; + } + } + + sgnFlag = (sum_cb_cr < 0); + } + + state->frame->jccr_sign = sgnFlag; +} + static void encoder_state_worker_encode_lcu_bitstream(void* opaque); static void encoder_state_worker_encode_lcu_search(void * opaque) @@ -665,7 +706,7 @@ static void 
encoder_state_worker_encode_lcu_search(void * opaque) encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->max_qp_delta_depth >= 0) { + if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); @@ -716,6 +757,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) const uint64_t existing_bits = uvg_bitstream_tell(&state->stream); //Encode SAO + state->cabac.update = 1; if (encoder->cfg.sao_type) { encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]); } @@ -771,6 +813,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) uvg_cabac_start(&state->cabac); } } + state->cabac.update = 0; pthread_mutex_lock(&state->frame->rc_lock); @@ -1421,6 +1464,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) } } + +/** + * \brief Return weight for 360 degree ERP video + * + * Returns the scaling factor of area from equirectangular projection to + * spherical surface. + * + * \param y y-coordinate of the pixel + * \param h height of the picture + */ +static double ws_weight(int y, int h) +{ + return cos((y - 0.5 * h + 0.5) * (M_PI / h)); +} + + +/** + * \brief Update ROI QPs for 360 video with equirectangular projection. + * + * Updates the ROI parameters in frame->roi. + * + * \param encoder encoder control + * \param frame frame that will have the ROI map + */ +static void init_erp_aqp_roi(const encoder_control_t *encoder, uvg_picture *frame) +{ + int8_t *orig_roi = frame->roi.roi_array; + int32_t orig_width = frame->roi.width; + int32_t orig_height = frame->roi.height; + + // Update ROI with WS-PSNR delta QPs. + int new_height = encoder->in.height_in_lcu; + int new_width = orig_roi ? 
orig_width : 1; + int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0])); + + int frame_height = encoder->in.real_height; + + double total_weight = 0.0; + for (int y = 0; y < frame_height; y++) { + total_weight += ws_weight(y, frame_height); + } + + for (int y_lcu = 0; y_lcu < new_height; y_lcu++) { + int y_orig = LCU_WIDTH * y_lcu; + int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig); + + double lcu_weight = 0.0; + for (int y = y_orig; y < y_orig + lcu_height; y++) { + lcu_weight += ws_weight(y, frame_height); + } + // Normalize. + lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height); + + int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight)); + + if (orig_roi) { + // If a ROI array already exists, we copy the existing values to the + // new array while adding qp_delta to each. + int y_roi = y_lcu * orig_height / new_height; + for (int x = 0; x < new_width; x++) { + new_array[x + y_lcu * new_width] = + CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta); + } + + } else { + // Otherwise, simply write qp_delta to the ROI array. + new_array[y_lcu] = qp_delta; + } + } + + // Update new values + frame->roi.width = new_width; + frame->roi.height = new_height; + frame->roi.roi_array = new_array; + FREE_POINTER(orig_roi); +} + + +static void next_roi_frame_from_file(uvg_picture *frame, FILE *file, enum uvg_roi_format format) { + // The ROI description is as follows: + // First number is width, second number is height, + // then follows width * height number of dqp values. + + // Rewind the (seekable) ROI file when end of file is reached. + // Allows a single ROI frame to be used for a whole sequence + // and looping with --loop-input. Skips possible whitespace. 
+ if (ftell(file) != -1L) { + int c = fgetc(file); + while (format == UVG_ROI_TXT && isspace(c)) c = fgetc(file); + ungetc(c, file); + if (c == EOF) rewind(file); + } + + int *width = &frame->roi.width; + int *height = &frame->roi.height; + + bool failed = false; + + if (format == UVG_ROI_TXT) failed = !fscanf(file, "%d", width) || !fscanf(file, "%d", height); + if (format == UVG_ROI_BIN) failed = fread(&frame->roi, 4, 2, file) != 2; + + if (failed) { + fprintf(stderr, "Failed to read ROI size.\n"); + fclose(file); + assert(0); + } + + if (*width <= 0 || *height <= 0) { + fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height); + fclose(file); + assert(0); + } + + if (*width > 10000 || *height > 10000) { + fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(file); + assert(0); + } + + const unsigned size = (*width) * (*height); + int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0])); + if (!dqp_array) { + fprintf(stderr, "Failed to allocate memory for ROI table.\n"); + fclose(file); + assert(0); + } + + FREE_POINTER(frame->roi.roi_array); + frame->roi.roi_array = dqp_array; + + if (format == UVG_ROI_TXT) { + for (int i = 0; i < size; ++i) { + int number; // Need a pointer to int for fscanf + if (fscanf(file, "%d", &number) != 1) { + fprintf(stderr, "Reading ROI file failed.\n"); + fclose(file); + assert(0); + } + dqp_array[i] = CLIP(-51, 51, number); + } + } else if (format == UVG_ROI_BIN) { + if (fread(dqp_array, 1, size, file) != size) { + fprintf(stderr, "Reading ROI file failed.\n"); + assert(0); + } + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1437,6 +1628,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); } + // ROI / delta QP maps + if (frame->roi.roi_array && 
cfg->roi.file_path) { + assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified."); + } + + // Read frame from the file. If no file is specified, + // ROI data should be already set by the application. + if (cfg->roi.file_path) { + next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format); + } + + if (cfg->erp_aqp) { + init_erp_aqp_roi(state->encoder_control, state->tile->frame->source); + } + // Variance adaptive quantization if (cfg->vaq) { const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; @@ -1523,6 +1729,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict } // Variance adaptive quantization - END + if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) { + state->frame->max_qp_delta_depth = 0; + } else { + state->frame->max_qp_delta_depth = -1; + } + // Use this flag to handle closed gop irap picture selection. // If set to true, irap is already set and we avoid // setting it based on the intra period @@ -1689,6 +1901,7 @@ void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame) encoder_state_init_new_frame(state, frame); + if(state->encoder_control->cfg.jccr) set_joint_cb_cr_modes(state, frame); // Create a separate job for ALF done after everything else, and only then do final bitstream writing (for ALF parameters) if (state->encoder_control->cfg.alf_type && state->encoder_control->cfg.wpp) { @@ -1834,10 +2047,9 @@ lcu_stats_t* uvg_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y) int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { - const encoder_control_t *ctrl = state->encoder_control; const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth); + const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, 
uvg_cu_array_at_const(cua, x, y)->depth); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); diff --git a/src/encoderstate.h b/src/encoderstate.h index 620af515..40e1dc24 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t { */ double *aq_offsets; + int8_t max_qp_delta_depth; + /** * \brief Whether next NAL is the first NAL in the access unit. */ @@ -193,6 +195,7 @@ typedef struct encoder_state_config_frame_t { cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row uint8_t* hmvp_size; //!< \brief HMVP LUT size + bool jccr_sign; } encoder_state_config_frame_t; @@ -320,6 +323,7 @@ typedef struct encoder_state_t { bitstream_t stream; cabac_data_t cabac; + cabac_data_t search_cabac; uint32_t stats_bitstream_length; //Bitstream length written in bytes @@ -402,10 +406,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) */ static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) { - if (state->encoder_control->max_qp_delta_depth < 0) return false; + if (state->frame->max_qp_delta_depth < 0) return false; const int cu_width = LCU_WIDTH >> depth; - const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; const int right = x + cu_width; const int bottom = y + cu_width; return (right % qg_width == 0 || right >= state->tile->frame->width) && diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c index f077ec21..d708fbfd 100644 --- a/src/fast_coeff_cost.c +++ b/src/fast_coeff_cost.c @@ -40,7 +40,7 @@ static uint16_t to_q88(float f) return (uint16_t)(f * 256.0f + 0.5f); } -static uint64_t to_4xq88(const float f[4]) +static uint64_t to_4xq88(const double f[4]) { int i; uint64_t result = 0; @@ -58,9 +58,9 @@ int uvg_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE 
*fast_ uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp; for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) { - float curr_wts[4]; + double curr_wts[4]; - if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0, + if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0, curr_wts + 1, curr_wts + 2, curr_wts + 3) != 4) { diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h index 0639a34c..5c53fdf1 100644 --- a/src/fast_coeff_cost.h +++ b/src/fast_coeff_cost.h @@ -45,7 +45,7 @@ typedef struct { // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from // 0 to MAX_FAST_COEFF_COST_QP -static const float default_fast_coeff_cost_wts[][4] = { +static const double default_fast_coeff_cost_wts[][4] = { // Just extend it by stretching the first actual values.. {0.164240f, 4.161530f, 3.509033f, 6.928047f}, {0.164240f, 4.161530f, 3.509033f, 6.928047f}, diff --git a/src/filter.c b/src/filter.c index 656b7889..1641109d 100644 --- a/src/filter.c +++ b/src/filter.c @@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir) static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (state->encoder_control->max_qp_delta_depth < 0) { + if (state->frame->max_qp_delta_depth < 0) { return state->qp; } diff --git a/src/image.c b/src/image.c index 48a1e958..ff960f26 100644 --- a/src/image.c +++ b/src/image.c @@ -106,6 +106,10 @@ uvg_picture * uvg_image_alloc(enum uvg_chroma_format chroma_format, const int32_ im->interlacing = UVG_INTERLACING_NONE; + im->roi.roi_array = NULL; + im->roi.width = 0; + im->roi.height = 0; + return im; } @@ -132,6 +136,7 @@ void uvg_image_free(uvg_picture *const im) uvg_image_free(im->base_image); } else { free(im->fulldata_buf); + if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array); } // Make sure freed data won't be used. 
@@ -192,6 +197,8 @@ uvg_picture *uvg_image_make_subimage(uvg_picture *const orig_image, im->pts = 0; im->dts = 0; + im->roi = orig_image->roi; + return im; } diff --git a/src/inter.c b/src/inter.c index d28d7002..7333a3cf 100644 --- a/src/inter.c +++ b/src/inter.c @@ -624,7 +624,9 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, int i_pu) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); const int pu_w = PU_GET_W(cu->part_size, width, i_pu); @@ -673,6 +675,12 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, NULL, predict_luma, predict_chroma); } + + if (predict_chroma && state->encoder_control->cfg.jccr) { + const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + } } /** @@ -1290,7 +1298,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, int32_t width, int32_t height, const merge_candidates_t *merge_cand, - const cu_info_t *cur_cu, + const cu_info_t * const cur_cu, int8_t reflist, mv_t mv_cand[2][2]) { @@ -1396,7 +1404,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, mv_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t * const cur_cu, lcu_t *lcu, int8_t reflist) { diff --git a/src/inter.h b/src/inter.h index 3d3ae797..45f5e5ea 100644 --- a/src/inter.h +++ b/src/inter.h @@ -96,7 +96,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, int32_t width, int32_t height, mv_t mv_cand[2][2], - cu_info_t* cur_cu, + const cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist); 
diff --git a/src/intra.c b/src/intra.c index 8f1d9aab..97702498 100644 --- a/src/intra.c +++ b/src/intra.c @@ -82,6 +82,17 @@ static const uint8_t num_ref_pixels_left[16][16] = { { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 } }; + +static void mip_predict( + const encoder_state_t* const state, + const uvg_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + uvg_pixel* dst, + const int mip_mode, + const bool mip_transp); + + int8_t uvg_intra_get_dir_luma_predictor( const uint32_t x, const uint32_t y, @@ -452,7 +463,7 @@ static void get_cclm_parameters( } } -static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) { +static void linear_transform_cclm(const cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) { int scale = cclm_params->a; int shift = cclm_params->shift; int offset = cclm_params->b; @@ -468,7 +479,7 @@ static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * sr } -void uvg_predict_cclm( +static void predict_cclm( encoder_state_t const* const state, const color_t color, const int8_t width, @@ -477,7 +488,7 @@ void uvg_predict_cclm( const int16_t y0, const int16_t stride, const int8_t mode, - lcu_t* const lcu, + const lcu_t* const lcu, uvg_intra_references* chroma_ref, uvg_pixel* dst, cclm_parameters_t* cclm_params @@ -498,6 +509,7 @@ void uvg_predict_cclm( uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; + const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // Essentially what this does is that it uses 6-tap filtering to downsample // the luma intra references down to match the resolution of the chroma channel. 
@@ -508,12 +520,12 @@ void uvg_predict_cclm( if (y0) { for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; - cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4); if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break; } if(y_scu == 0) { if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4); - memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2)); + memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride2 / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2)); } else { for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) { @@ -533,16 +545,16 @@ void uvg_predict_cclm( if(x0) { for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; - cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension); if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break; if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; } for(int i = 0; i < height + available_left_below * 2; i++) { - sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1]; + sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; } } - uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width); + uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride2) / 4], sampled_luma, width, height, stride2 / 2, width); int16_t a, b, shift; get_cclm_parameters(state, width, height, 
mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift); @@ -727,12 +739,17 @@ void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* } + /** \brief Matrix weighted intra prediction. */ -void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* const refs, - const uint16_t pred_block_width, const uint16_t pred_block_height, - uvg_pixel* dst, - const int mip_mode, const bool mip_transp) +static void mip_predict( + const encoder_state_t* const state, + const uvg_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + uvg_pixel* dst, + const int mip_mode, + const bool mip_transp) { // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative @@ -875,14 +892,13 @@ void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* c } -void uvg_intra_predict( - encoder_state_t *const state, +static void intra_predict_regular( + const encoder_state_t* const state, uvg_intra_references *refs, int_fast8_t log2_width, int_fast8_t mode, color_t color, uvg_pixel *dst, - bool filter_boundary, const uint8_t multi_ref_idx) { const int_fast8_t width = 1 << log2_width; @@ -1350,18 +1366,66 @@ void uvg_intra_build_reference( } } + +void uvg_intra_predict( + const encoder_state_t* const state, + uvg_intra_references* const refs, + const cu_loc_t* const cu_loc, + const color_t color, + uvg_pixel* dst, + const intra_search_data_t* data, + const lcu_t* lcu + ) +{ + const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); + // TODO: what is this used for? + // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); + bool use_mip = false; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int x = cu_loc->x; + const int y = cu_loc->y; + int8_t intra_mode = color == COLOR_Y ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma; + if (data->pred_cu.intra.mip_flag) { + if (color == COLOR_Y) { + use_mip = true; + } + else { + use_mip = state->encoder_control->chroma_format == UVG_CSP_444; + intra_mode = use_mip ? intra_mode : 0; + } + } + if (intra_mode < 68) { + if (use_mip) { + assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); + mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); + } + else { + intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + } + } + else { + uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); + if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { + predict_cclm( + state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, + (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]); + } + else { + linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); + } + } +} + + static void intra_recon_tb_leaf( - encoder_state_t *const state, + encoder_state_t* const state, int x, int y, int depth, - int8_t intra_mode, - cclm_parameters_t *cclm_params, lcu_t *lcu, color_t color, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp) + const intra_search_data_t* search_data) { const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; @@ -1383,7 +1447,7 @@ static void intra_recon_tb_leaf( int x_scu = SUB_SCU(x); int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; - uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; + uint8_t multi_ref_index = color == COLOR_Y ? 
search_data->pred_cu.intra.multi_ref_idx: 0; uvg_intra_references refs; // Extra reference lines for use with MRL. Extra lines needed only for left edge. @@ -1406,42 +1470,14 @@ static void intra_recon_tb_leaf( uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); uvg_pixel pred[32 * 32]; - int stride = state->tile->frame->source->stride; - const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); - bool use_mip = false; - if (mip_flag) { - if (color == COLOR_Y) { - use_mip = true; - } else { - // MIP can be used for chroma if the chroma scheme is 444 - if (state->encoder_control->chroma_format == UVG_CSP_444) { - use_mip = true; - } else { - // If MIP cannot be used for chroma, set mode to planar - intra_mode = 0; - } - } - } - if(intra_mode < 68) { - if (use_mip) { - assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); - uvg_mip_predict(state, &refs, width, height, pred, intra_mode, mip_transp); - } - else { - uvg_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index); - } - } else { - uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width); - if(cclm_params == NULL) { - cclm_parameters_t temp_params; - uvg_predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params); - } - else { - linear_transform_cclm(&cclm_params[color == COLOR_U ? 
0 : 1], pred, pred, width, width); - } - } + cu_loc_t loc = { + x, y, + width, height, + width, height, + }; + + uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; @@ -1483,17 +1519,12 @@ static void intra_recon_tb_leaf( * \param lcu containing LCU */ void uvg_intra_recon_cu( - encoder_state_t *const state, + encoder_state_t* const state, int x, int y, int depth, - int8_t mode_luma, - int8_t mode_chroma, + intra_search_data_t* search_data, cu_info_t *cur_cu, - cclm_parameters_t *cclm_params, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp, lcu_t *lcu) { const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; @@ -1501,12 +1532,16 @@ void uvg_intra_recon_cu( if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - uint8_t multi_ref_index = multi_ref_idx; - bool use_mip = mip_flag; - bool mip_transposed = mip_transp; + const int8_t mode_luma = search_data->pred_cu.intra.mode; + const int8_t mode_chroma= search_data->pred_cu.intra.mode_chroma; + + if(mode_chroma != -1 && mode_luma == -1) { + x &= ~7; + y &= ~7; + } if (mode_luma != -1 && mode_chroma != -1) { - if (use_mip) { + if (search_data->pred_cu.intra.mip_flag) { assert(mode_luma == mode_chroma && "Chroma mode must be derived from luma mode if block uses MIP."); } } @@ -1527,10 +1562,10 @@ void uvg_intra_recon_cu( const int32_t x2 = x + offset; const int32_t y2 = y + offset; - uvg_intra_recon_cu(state, x, y, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - uvg_intra_recon_cu(state, x2, y, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - uvg_intra_recon_cu(state, x, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); - uvg_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu); + 
uvg_intra_recon_cu(state, x, y, depth + 1, search_data, NULL, lcu); + uvg_intra_recon_cu(state, x2, y, depth + 1, search_data, NULL, lcu); + uvg_intra_recon_cu(state, x, y2, depth + 1, search_data, NULL, lcu); + uvg_intra_recon_cu(state, x2, y2, depth + 1, search_data, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. uint16_t child_cbfs[3] = { @@ -1552,13 +1587,15 @@ void uvg_intra_recon_cu( // Process a leaf TU. if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data); } if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed); - intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data); + intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data); } - uvg_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false); + uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), + search_data->pred_cu.joint_cb_cr != 4 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0), + x, y, depth, cur_cu, lcu, false); } } diff --git a/src/intra.h b/src/intra.h index dd90a87b..7f6c04d0 100644 --- a/src/intra.h +++ b/src/intra.h @@ -63,6 +63,18 @@ typedef struct int16_t b; } cclm_parameters_t; +typedef struct { + cu_info_t pred_cu; + cclm_parameters_t cclm_parameters[2]; + double cost; + double bits; + double coeff_bits; + double distortion; +} intra_search_data_t ; + + +#define UVG_NUM_INTRA_MODES 67 + /** * \brief Function for deriving intra luma predictions * \param x x-coordinate of the PU in pixels @@ -114,53 +126,22 @@ void uvg_intra_build_reference( * \param filter_boundary Whether to filter the boundary on modes 10 and 26. 
*/ void uvg_intra_predict( - encoder_state_t *const state, - uvg_intra_references *refs, - int_fast8_t log2_width, - int_fast8_t mode, - color_t color, - uvg_pixel *dst, - bool filter_boundary, - const uint8_t multi_ref_idx); + const encoder_state_t* const state, + uvg_intra_references* const refs, + const cu_loc_t* const cu_loc, + const color_t color, + uvg_pixel* dst, + const intra_search_data_t* data, + const lcu_t* lcu +); void uvg_intra_recon_cu( - encoder_state_t *const state, + encoder_state_t* const state, int x, int y, int depth, - int8_t mode_luma, - int8_t mode_chroma, + intra_search_data_t* search_data, cu_info_t *cur_cu, - cclm_parameters_t* cclm_params, - uint8_t multi_ref_idx, - bool mip_flag, - bool mip_transp, lcu_t *lcu); - -void uvg_predict_cclm( - encoder_state_t const* const state, - const color_t color, - const int8_t width, - const int8_t height, - const int16_t x0, - const int16_t y0, - const int16_t stride, - const int8_t mode, - lcu_t* const lcu, - uvg_intra_references* chroma_ref, - uvg_pixel* dst, - cclm_parameters_t* cclm_params -); - int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); - -void uvg_mip_predict( - encoder_state_t const * const state, - uvg_intra_references * refs, - const uint16_t width, - const uint16_t height, - uvg_pixel* dst, - const int mip_mode, - const bool mip_transp -); \ No newline at end of file diff --git a/src/rate_control.c b/src/rate_control.c index 27cc86ba..ca2215a5 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -1088,17 +1088,20 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, const encoder_control_t * const ctrl = state->encoder_control; lcu_stats_t *lcu = uvg_get_lcu_stats(state, pos.x, pos.y); - if (ctrl->cfg.roi.dqps != NULL) { - vector2d_t lcu = { + if (state->tile->frame->source->roi.roi_array) { + vector2d_t lcu_vec = { pos.x + state->tile->lcu_offset_x, pos.y + state->tile->lcu_offset_y }; vector2d_t roi = { - 
lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu, - lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu + lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu, + lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu }; - int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; - int dqp = ctrl->cfg.roi.dqps[roi_index]; + int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width; + int dqp = state->tile->frame->source->roi.roi_array[roi_index]; + if(dqp != 0) { + pos.x = 0; + } state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); diff --git a/src/rdo.c b/src/rdo.c index 29bbdc97..8bad55a5 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost( // Take a copy of the CABAC so that we don't overwrite the contexts when // counting the bits. cabac_data_t cabac_copy; - memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy)); + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); // Clear bytes and bits and set mode to "count" cabac_copy.only_count = 1; - cabac_copy.num_buffered_bytes = 0; - cabac_copy.bits_left = 23; + int num_buffered_bytes = cabac_copy.num_buffered_bytes; + int bits_left = cabac_copy.bits_left; // Execute the coding function. 
// It is safe to drop the const modifier since state won't be modified @@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost( type, scan_mode); } - - return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); + if(cabac_copy.update) { + memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy)); + } + return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3); } static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc) @@ -1741,37 +1743,33 @@ void uvg_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, /** * Calculate cost of actual motion vectors using CABAC coding */ -uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { cabac_data_t cabac_copy = *cabac; cabac_copy.only_count = 1; - + double bits = 0; // It is safe to drop const here because cabac->only_count is set. 
- uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits); - uint32_t bitcost = - ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - - ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); - - return bitcost; + return bits; } /** MVD cost calculation with CABAC * \returns int * Calculates Motion Vector cost and related costs using CABAC coding */ -uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, - int x, - int y, - int mv_shift, - mv_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +double uvg_calc_mvd_cost_cabac(const encoder_state_t * state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) { cabac_data_t state_cabac_copy; cabac_data_t* cabac; @@ -1798,14 +1796,13 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, } // Store cabac state and contexts - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); + memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t)); // Clear bytes and bits and set mode to "count" state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; cabac = &state_cabac_copy; + double bits = 0; if (!merged) { vector2d_t mvd1 = { @@ -1820,8 +1817,8 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1); uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2); - uint32_t cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); - uint32_t cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); + double cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + double cand2_cost = 
uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -1834,7 +1831,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, merged, "MergeFlag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag"); num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { @@ -1842,10 +1839,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, for (ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != merge_idx); if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); } else { CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + bits += 1; } if (symbol == 0) break; } @@ -1868,24 +1865,23 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, if (ref_list[ref_list_idx] > 1) { // parseRefFrmIdx int32_t ref_frame = ref_idx; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX"); if (ref_frame > 0) { int32_t i; int32_t ref_num = ref_list[ref_list_idx] - 2; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); + ref_frame--; for (i = 0; i < ref_num; ++i) { const uint32_t symbol = (i == ref_frame) ? 
0 : 1; if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX"); } else { CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + bits += 1; } if (symbol == 0) break; } @@ -1895,7 +1891,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { // It is safe to drop const here because cabac->only_count is set. - uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); + uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits); } // Signal which candidate MV to use @@ -1905,10 +1901,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state, } } - *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); + *bitcost = bits; // Store bitcost before restoring cabac - return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5); + return *bitcost * state->lambda_sqrt; } void uvg_close_rdcost_outfiles(void) diff --git a/src/rdo.h b/src/rdo.h index 7a365254..46db8c90 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -77,10 +77,10 @@ uint32_t uvg_get_coded_level(encoder_state_t * state, double* coded_cost, double uvg_mvd_cost_func uvg_calc_mvd_cost_cabac; -uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state, - const cabac_data_t* cabac, - int32_t mvd_hor, - int32_t mvd_ver); +double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state, + const cabac_data_t* cabac, + int32_t mvd_hor, + int32_t mvd_ver); // Number of fixed point fractional bits used in the fractional bit table. 
#define CTX_FRAC_BITS 15 @@ -90,8 +90,5 @@ uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state, extern const uint32_t uvg_entropy_bits[512]; #define CTX_ENTROPY_BITS(ctx, val) uvg_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] -// Floating point fractional bits, derived from uvg_entropy_bits -extern const float uvg_f_entropy_bits[512]; -#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #endif diff --git a/src/sao.c b/src/sao.c index e61d638e..e83b8117 100644 --- a/src/sao.c +++ b/src/sao.c @@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) { } -static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) +static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, none = 0 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type"); return mode_bits; } -static float sao_mode_bits_merge(const encoder_state_t * const state, +static double sao_mode_bits_merge(const encoder_state_t * const state, int8_t merge_cand) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag"); if (merge_cand == 1) return mode_bits; - mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2); + CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag"); return mode_bits; } -static float sao_mode_bits_edge(const encoder_state_t * const state, +static double sao_mode_bits_edge(const encoder_state_t * const state, int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. 
if (sao_left != NULL) { - ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + ctx = &(cabac->ctx.sao_merge_flag_model); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded type_idx_, edge = 2 = cMax ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets. for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) { @@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state, } -static float sao_mode_bits_band(const encoder_state_t * const state, +static double sao_mode_bits_band(const encoder_state_t * const state, int band_position[2], int offsets[10], sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt) { - float mode_bits = 0.0; - const cabac_data_t * const cabac = &state->cabac; - const cabac_ctx_t *ctx = NULL; + double mode_bits = 0.0; + cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = NULL; // FL coded merges. if (sao_left != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } if (sao_top != NULL) { ctx = &(cabac->ctx.sao_merge_flag_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 0); + CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag"); } // TR coded sao_type_idx_, band = 1 ctx = &(cabac->ctx.sao_type_idx_model); - mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0; + CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type"); + mode_bits += 1.0; // TR coded offsets and possible FL coded offset signs. 
for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) @@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const uvg_ // Choose between SAO and doing nothing, taking into account the // rate-distortion cost of coding do nothing. { - int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5); + float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left); + int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; diff --git a/src/search.c b/src/search.c index ac58ef99..e3845569 100644 --- a/src/search.c +++ b/src/search.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "imagelist.h" #include "inter.h" #include "intra.h" @@ -59,14 +60,6 @@ // Cost threshold for doing intra search in inter frames with --rd=0. static const int INTRA_THRESHOLD = 8; -// Modify weight of luma SSD. -#ifndef LUMA_MULT -# define LUMA_MULT 0.8 -#endif -// Modify weight of chroma SSD. 
-#ifndef CHROMA_MULT -# define CHROMA_MULT 1.5 -#endif static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) { @@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; - ssd += LUMA_MULT * uvg_pixels_calc_ssd( + ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], LCU_WIDTH, LCU_WIDTH, cu_width ); if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { - ssd += CHROMA_MULT * uvg_pixels_calc_ssd( + ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); - ssd += CHROMA_MULT * uvg_pixels_calc_ssd( + ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 ); @@ -251,7 +244,8 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int x_scu = SUB_SCU(x); int y_scu = SUB_SCU(y); y_rec += x_scu + y_scu * LCU_WIDTH; - int stride = state->tile->frame->source->stride; + const int stride = state->tile->frame->rec->stride; + const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { for (int x_ = 0; x_ < width; x_++) { @@ -265,13 +259,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, s += y_rec[2 * x_ + LCU_WIDTH] * 2; s += y_rec[2 * x_ + 1 + LCU_WIDTH]; s += !x_scu && !x_ && x ? 
state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH]; - int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2; + int index = x / 2 + x_ + (y / 2 + y_ )* stride2 / 2; state->tile->frame->cclm_luma_rec[index] = s >> 3; } y_rec += LCU_WIDTH * 2; } if((y + height * 2) % 64 == 0) { - int line = y / 64 * stride / 2; + int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; for (int i = 0; i < width; ++i) { int s = 2; @@ -294,11 +288,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, * prediction unit data needs to be coded. */ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu) + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // cur_cu is used for TU parameters. cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -324,14 +320,36 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, return sum + tr_tree_bits * state->lambda; } + + if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) { + // Because these need to be coded before the luma cbf they also need to be counted + // before the cabac state changes. However, since this branch is only executed when + // calculating the last RD cost it is not problem to include the chroma cbf costs in + // luma, because the chroma cost is calculated right after the luma cost. + // However, if we have different tr_depth, the bits cannot be written in correct + // order anyways so do not touch the chroma cbf here. 
+ if (state->encoder_control->chroma_format != UVG_CSP_400) { + cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = cr_ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + } + } + // Add transform_tree cbf_luma bit cost. + const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; if (pred_cu->type == CU_INTRA || - tr_depth > 0 || + is_tr_split || cbf_is_set(tr_cu->cbf, depth, COLOR_U) || cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); + + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); } // SSD between reconstruction and original @@ -343,7 +361,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, width); } - { + + if (!skip_residual_coding) { int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; @@ -351,23 +370,22 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LUMA_MULT + bits * state->lambda; + return (double)ssd * UVG_LUMA_MULT + bits * state->lambda; } double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t * pred_cu, - lcu_t *const lcu) + const int x_px, const int y_px, const int depth, + cu_info_t *const pred_cu, + lcu_t *const lcu) { const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; const int 
width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); double tr_tree_bits = 0; - double joint_cbcr_tr_tree_bits = 0; double coeff_bits = 0; - double joint_coeff_bits = 0; assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); @@ -378,30 +396,28 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, return 0; } - if (depth < MAX_PU_DEPTH) { + // See luma for why the second condition + if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) { const int tr_depth = depth - pred_cu->depth; - const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); - } - if(state->encoder_control->cfg.jccr) { - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1); + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); } int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); - ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]); + ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); - } - if(state->encoder_control->cfg.jccr) { - ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, ctx, 
v_is_set, tr_tree_bits, "cbf_cb_search"); } } if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); - int sum = 0; + double sum = 0; sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); @@ -418,15 +434,10 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]); tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0); } - if(pred_cu->joint_cb_cr) { - ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]); - joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1); - } } // Chroma SSD int ssd = 0; - int joint_ssd = 0; if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], @@ -436,53 +447,266 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, LCU_WIDTH_C, LCU_WIDTH_C, width); ssd = ssd_u + ssd_v; + } - if(state->encoder_control->cfg.jccr) { + if (!skip_residual_coding) + { + int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0); + } + + + double bits = tr_tree_bits + coeff_bits; + + return (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda; +} + +static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state, + const int x_px, const int y_px, const int depth, + const cu_info_t* const pred_cu, + lcu_t* const lcu) { + const int width = LCU_WIDTH >> depth; + + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + // cur_cu is used for TU parameters. 
+ cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + + double coeff_bits = 0; + double tr_tree_bits = 0; + + // Check that lcu is not in + assert(x_px >= 0 && x_px < LCU_WIDTH); + assert(y_px >= 0 && y_px < LCU_WIDTH); + + const uint8_t tr_depth = tr_cu->tr_depth - depth; + + const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V); + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + + { + int cbf = cbf_is_set_any(pred_cu->cbf, depth); + // Only need to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + } + + } + + if(state->encoder_control->chroma_format != UVG_CSP_400 && !skip_residual_coding) { + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); + } + if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); + } + } + + if (tr_depth > 0) { + int offset = LCU_WIDTH >> (depth + 1); + double sum = 0; + + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + return sum + tr_tree_bits * state->lambda; + } + const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ; + + // Add transform_tree cbf_luma bit cost. 
+ const int is_tr_split = depth - tr_cu->depth; + if ((pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) + && !skip_residual_coding) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]); + + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + } + + if (cb_flag_y | cb_flag_u | cb_flag_v) { + // TODO qp_delta_sign_flag + + if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && state->encoder_control->cfg.jccr) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag"); + } + } + + + // SSD between reconstruction and original + unsigned luma_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + int index = y_px * LCU_WIDTH + x_px; + luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width); + } + + { + int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip); + } + + unsigned chroma_ssd = 0; + if(state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) { + const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 }; + const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); + int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); + const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + if(pred_cu->joint_cb_cr == 0) { + if (!state->encoder_control->cfg.lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + chroma_width); + unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + 
chroma_width); + chroma_ssd = ssd_u + ssd_v; + } + + { + + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0); + } + } else { int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], LCU_WIDTH_C, LCU_WIDTH_C, width); int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); - joint_ssd = ssd_u_joint + ssd_v_joint; + chroma_width); + chroma_ssd = ssd_u_joint + ssd_v_joint; + coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); } } + double bits = tr_tree_bits + coeff_bits; + return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda; +} + + +void uvg_select_jccr_mode( + const encoder_state_t* const state, + const int x_px, + const int y_px, + const int depth, + cu_info_t* pred_cu, + lcu_t* const lcu, + double* cost_out) +{ + const vector2d_t lcu_px = { (SUB_SCU(x_px) & ~7) / 2, (SUB_SCU(y_px) & ~7) / 2 }; + const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x_px), SUB_SCU(y_px)); + assert(pred_cu->depth == pred_cu->tr_depth && "jccr does not support transform splitting"); + if (cost_out == NULL && pred_cu->joint_cb_cr == 0) { + return; + } + + double tr_tree_bits = 0; + double joint_cbcr_tr_tree_bits = 0; + double coeff_bits = 0; + double joint_coeff_bits = 0; + + assert(lcu_px.x >= 0 && lcu_px.x < LCU_WIDTH_C); + assert(lcu_px.y >= 0 && lcu_px.y < LCU_WIDTH_C); + + if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) { + // For MAX_PU_DEPTH calculate chroma for previous depth for the first + // block and return 0 cost for all others. 
+ return; + } + + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; + int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U); + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search"); + + int cbf_mask = u_is_set * 2 + v_is_set - 1; + if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2) + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag"); + + if(pred_cu->joint_cb_cr) { + const int u_jccr = (pred_cu->joint_cb_cr >> 1) & 1; + ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + CABAC_FBITS_UPDATE(cabac, ctx, u_jccr, joint_cbcr_tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_jccr]); + CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); + cbf_mask = pred_cu->joint_cb_cr - 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag"); + } + unsigned ssd = 0; + unsigned joint_ssd = 0; + if (!state->encoder_control->cfg.lossless) { + const int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + const unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + const unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + ssd = ssd_u + ssd_v; + + if (pred_cu->joint_cb_cr) { + const unsigned ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + const unsigned ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + joint_ssd = ssd_u_joint + ssd_v_joint; + } + } + { int8_t scan_order = 
uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0); coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0); - - if(state->encoder_control->cfg.jccr) { - joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); - } + + joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0); } double bits = tr_tree_bits + coeff_bits; double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits; - double cost = (double)ssd + bits * state->c_lambda; - double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda; + double cost = (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda; + double joint_cost = (double)joint_ssd * UVG_CHROMA_MULT + joint_bits * state->c_lambda; if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) { pred_cu->joint_cb_cr = 0; - return cost; + if (cost_out) *cost_out += cost; + return; } cbf_clear(&pred_cu->cbf, depth, COLOR_U); cbf_clear(&pred_cu->cbf, depth, COLOR_V); - if (pred_cu->joint_cb_cr & 1) { + if (pred_cu->joint_cb_cr & 2) { cbf_set(&pred_cu->cbf, depth, COLOR_U); } - if (pred_cu->joint_cb_cr & 2) { + if (pred_cu->joint_cb_cr & 1) { cbf_set(&pred_cu->cbf, depth, COLOR_V); } int lcu_width = LCU_WIDTH_C; const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width); uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width); - return joint_cost; + if (cost_out) *cost_out += joint_cost; } @@ -492,23 +716,9 @@ static double calc_mode_bits(const encoder_state_t *state, const cu_info_t * cur_cu, int x, int y, int depth) { - int x_local = SUB_SCU(x); - int y_local = SUB_SCU(y); - assert(cur_cu->type == 
CU_INTRA); - int8_t candidate_modes[INTRA_MPM_COUNT]; - { - const cu_info_t *left_cu = ((x >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local - SCU_WIDTH, y_local) : NULL); - const cu_info_t *above_cu = ((y >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local, y_local - SCU_WIDTH) : NULL); - uvg_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); - } - - int width = LCU_WIDTH >> depth; - int height = width; // TODO: height for non-square blocks - int num_mip_modes_half = NUM_MIP_MODES_HALF(width, height); - int mip_flag_ctx_id = uvg_get_mip_flag_context(x, y, width, height, lcu, NULL); - double mode_bits = uvg_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, num_mip_modes_half, mip_flag_ctx_id); + double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu); if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); @@ -518,6 +728,7 @@ static double calc_mode_bits(const encoder_state_t *state, } +// TODO: replace usages of this by the uvg_sort_indices_by_cost function. /** * \brief Sort modes and costs to ascending order according to costs. */ @@ -567,16 +778,25 @@ void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf } } - - -static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) +/** + * \brief Sort keys (indices) to ascending order according to costs. + */ +void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) { - vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; - bool condA = x >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x - 1, lcu_cu.y )->depth > depth; - bool condL = y >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y - 1)->depth > depth; - return condA + condL; + // Size of sorted arrays is expected to be "small". No need for faster algorithm. 
+ for (uint8_t i = 1; i < map->size; ++i) { + const int8_t cur_indx = map->keys[i]; + const double cur_cost = map->cost[cur_indx]; + uint8_t j = i; + while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) { + map->keys[j] = map->keys[j - 1]; + --j; + } + map->keys[j] = cur_indx; + } } + /** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -592,10 +812,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; - double cost = MAX_INT; - double inter_zero_coeff_cost = MAX_INT; - uint32_t inter_bitcost = MAX_INT; + double cost = MAX_DOUBLE; + double inter_zero_coeff_cost = MAX_DOUBLE; + double inter_bitcost = MAX_INT; cu_info_t *cur_cu; + cabac_data_t pre_search_cabac; + memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; @@ -626,7 +848,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // Assign correct depth limit constraint_t* constr = state->constraint; - if(constr->ml_intra_depth_ctu) { + if(constr->ml_intra_depth_ctu) { pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; } @@ -670,7 +892,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, if (can_use_inter) { double mode_cost; - uint32_t mode_bitcost; + double mode_bitcost; uvg_search_cu_inter(state, x, y, depth, @@ -693,33 +915,34 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool 
can_use_intra = - WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width || - (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height; + (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) && + !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I); + intra_search_data_t intra_search; if (can_use_intra && !skip_intra) { - int8_t intra_mode; - int8_t intra_trafo; - double intra_cost; - uint8_t multi_ref_index = 0; - bool mip_flag = false; - bool mip_transposed = false; - uvg_search_cu_intra(state, x, y, depth, lcu, - &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed); - if (intra_cost < cost) { - cost = intra_cost; + intra_search.pred_cu = *cur_cu; + intra_search.pred_cu.joint_cb_cr = 4; + uvg_search_cu_intra(state, x, y, depth, &intra_search, + lcu); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default + if(state->frame->slicetype != UVG_SLICE_I) { + double pred_mode_type_bits = 0; + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag"); + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag"); + intra_cost += pred_mode_type_bits * state->lambda; + } +#endif + if (intra_search.cost < cost) { + cost = intra_search.cost; + *cur_cu = intra_search.pred_cu; cur_cu->type = CU_INTRA; - cur_cu->part_size = depth > MAX_DEPTH ? 
SIZE_NxN : SIZE_2Nx2N; - cur_cu->intra.mode = intra_mode; - cur_cu->intra.multi_ref_idx = multi_ref_index; - cur_cu->intra.mip_flag = mip_flag; - cur_cu->intra.mip_is_transposed = mip_transposed; - - //If the CU is not split from 64x64 block, the MTS is disabled for that CU. - cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0; } } @@ -727,20 +950,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // mode search of adjacent CUs. if (cur_cu->type == CU_INTRA) { assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); - cur_cu->intra.mode_chroma = cur_cu->intra.mode; - + + intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); uvg_intra_recon_cu(state, x, y, - depth, - cur_cu->intra.mode, -1, // skip chroma - NULL, NULL, cur_cu->intra.multi_ref_idx, - cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + depth, &intra_search, + NULL, lcu); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); + cur_cu->joint_cb_cr = 0; // TODO: This heavily relies to square CUs if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400) { @@ -748,19 +970,47 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. 
- cclm_parameters_t cclm_params[2]; + intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { - cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params); + cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search); + + if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4; + else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - + intra_search.pred_cu.intra.mode = -1; // skip luma uvg_intra_recon_cu(state, - x & ~7, y & ~7, // TODO: as does this - depth, - -1, cur_cu->intra.mode_chroma, // skip luma - NULL, cclm_params, 0, - cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + x, y, // TODO: as does this + depth, &intra_search, + NULL, lcu); + if(depth != 0 && state->encoder_control->cfg.jccr && ctrl->cfg.rdo < 3) { + uvg_select_jccr_mode(state, + x, y, + depth, + NULL, + lcu, + NULL); + } + else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) { + assert(cur_cu->joint_cb_cr < 4); + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); + if (cur_cu->joint_cb_cr & 2) { + cbf_set(&cur_cu->cbf, depth, COLOR_U); + } + if (cur_cu->joint_cb_cr & 1) { + cbf_set(&cur_cu->cbf, depth, COLOR_V); + } + const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 }; + int lcu_width = LCU_WIDTH_C; + const int index = lcu_px.x + lcu_px.y * lcu_width; + const int width = (depth < MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; + uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width); + uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width); + + } } } else if (cur_cu->type == CU_INTER) { @@ -788,11 +1038,20 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } uvg_quantize_lcu_residual(state, - true, has_chroma, - x, y, depth, - NULL, - lcu, - false); + true, has_chroma, + state->encoder_control->cfg.jccr, x, y, + depth, + NULL, + lcu, + false); + if (cur_cu->depth == cur_cu->tr_depth && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr) { + uvg_select_jccr_mode(state, + x, y, + depth, + NULL, + lcu, + NULL); + } int cbf = cbf_is_set_any(cur_cu->cbf, depth); @@ -800,9 +1059,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; - } + int skip_ctx = uvg_get_skip_context(x, y, lcu, NULL, NULL); + inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1); + inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0); + inter_bitcost += cur_cu->merge_idx; } } lcu_fill_inter(lcu, x_local, y_local, cu_width); @@ -811,20 +1071,26 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { - cost = uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); - if (state->encoder_control->chroma_format != UVG_CSP_400) { - cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); + double bits = 0; + cabac_data_t* cabac = &state->search_cabac; + cabac->update = 1; + + if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { + bits += uvg_mock_encode_coding_unit( + 
state, + cabac, + x, y, depth, + lcu, + cur_cu); } - - double mode_bits; - if (cur_cu->type == CU_INTRA) { - mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth); - } else { - mode_bits = inter_bitcost; + else { + assert(0); } + + cost = bits * state->lambda; - cost += mode_bits * state->lambda; - + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; @@ -846,13 +1112,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, cur_cu->cbf = 0; lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } - } + cabac->update = 0; + } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. cur_cu->type == CU_NOTSET || - depth < pu_depth_intra.max || + (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || (state->frame->slicetype != UVG_SLICE_I && depth < pu_depth_inter.max); @@ -861,21 +1128,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int half_cu = cu_width / 2; double split_cost = 0.0; int cbf = cbf_is_set_any(cur_cu->cbf, depth); + cabac_data_t post_seach_cabac; + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + state->search_cabac.update = 1; + + double split_bits = 0; if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; + uvg_write_split_flag(state, &state->search_cabac, + x > 0 ? LCU_GET_CU_AT_PX(lcu,SUB_SCU(x) -1, SUB_SCU(y)): NULL, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, + 1, depth, cu_width, x, y, &split_bits); } - if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { - // Add cost of intra part_size. - const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); - cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda; // 2Nx2N - split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; // NxN - } + state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting @@ -897,13 +1166,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // searching. if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH - && x + cu_width <= frame->width && y + cu_width <= frame->height && 0) + && x + cu_width <= frame->width && y + cu_width <= frame->height + && state->encoder_control->cfg.combine_intra_cus) { + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; + double bits = 0; + uvg_write_split_flag(state, &state->search_cabac, + x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, + 0, depth, cu_width, x, y, & split_bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; @@ -914,29 +1193,25 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + + intra_search_data_t proxy; + FILL(proxy, 0); + proxy.pred_cu = *cur_cu; - const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1; uvg_intra_recon_cu(state, x, y, depth, - cur_cu->intra.mode, mode_chroma, - NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, + &proxy, + NULL, lcu); - cost += uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); - if (has_chroma) { - cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu); - } - - // Add the cost of coding no-split. - uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); - const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda; - - // Add the cost of coding intra mode only once. - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth); + double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; cost += mode_bits * state->lambda; + + cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu); + + memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } @@ -950,6 +1225,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if (depth > 0) { // Copy this CU's mode all the way down for use in adjacent CUs mode // search. 
+ memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); work_tree_copy_down(x_local, y_local, depth, work_tree); downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] @@ -962,6 +1238,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); } } + else { + downsample_cclm_rec( + state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + ); + } } else if (depth >= 0 && depth < MAX_PU_DEPTH) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. @@ -1139,6 +1420,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i */ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff) { + memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t)); + state->search_cabac.only_count = 1; assert(x % LCU_WIDTH == 0); assert(y % LCU_WIDTH == 0); diff --git a/src/search.h b/src/search.h index 85e76d23..9b4d92f7 100644 --- a/src/search.h +++ b/src/search.h @@ -44,22 +44,62 @@ #include "image.h" #include "constraint.h" -#define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12) -#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1 +#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS) + + // Modify weight of luma SSD. +#ifndef UVG_LUMA_MULT +#define UVG_LUMA_MULT 0.8 +#endif +// Modify weight of chroma SSD. +#ifndef UVG_CHROMA_MULT +#define UVG_CHROMA_MULT 1.5 +#endif + + /** + * \brief Data collected during search processes. + * + * The intended use is to collect statistics of the + * searched coding/prediction units. Data related to + * a specific unit is found at index i. 
The arrays + * should be indexed by elements of the "keys" array + * that will be sorted by the RD costs of the units. + */ +typedef struct unit_stats_map_t { + + cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units + double cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs + double bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs + int8_t keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays + int size; //!< number of active elements in the lists +} unit_stats_map_t; + +#define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)) +#define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1) void uvg_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length); void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length); +void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map); + void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff); double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu); + const int x_px, const int y_px, const int depth, + const cu_info_t *const pred_cu, + lcu_t *const lcu); double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t * pred_cu, - lcu_t *const lcu); + const int x_px, const int y_px, const int depth, + cu_info_t *const pred_cu, + lcu_t *const lcu); +void uvg_select_jccr_mode( + const encoder_state_t* const state, + const int x_px, + const int y_px, + const int depth, + cu_info_t* const pred_cu, + lcu_t* const lcu, + double* cost_out); + void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int 
depth, int tr_depth); void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); diff --git a/src/search_inter.c b/src/search_inter.c index a6feb1f5..836f45e4 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" +#include "encode_coding_tree.h" #include "image.h" #include "imagelist.h" #include "inter.h" @@ -68,7 +69,7 @@ typedef struct { /** * \brief Top-left corner of the PU */ - const vector2d_t origin; + vector2d_t origin; int32_t width; int32_t height; @@ -78,19 +79,6 @@ typedef struct { uvg_mvd_cost_func *mvd_cost_func; - /** - * \brief Best motion vector among the ones tested so far - */ - vector2d_t best_mv; - /** - * \brief Cost of best_mv - */ - uint32_t best_cost; - /** - * \brief Bit cost of best_mv - */ - uint32_t best_bitcost; - /** * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks @@ -205,20 +193,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int /** * \brief Calculate cost for an integer motion vector. * - * Updates info->best_mv, info->best_cost and info->best_bitcost to the new + * Updates best_mv, best_cost and best_bitcost to the new * motion vector if it yields a lower cost than the current one. * * If the motion vector violates the MV constraints for tiles or WPP, the * cost is not set. 
* - * \return true if info->best_mv was changed, false otherwise + * \return true if best_mv was changed, false otherwise */ -static bool check_mv_cost(inter_search_info_t *info, int x, int y) +static bool check_mv_cost(inter_search_info_t *info, + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { if (!intmv_within_tile(info, x, y)) return false; - uint32_t bitcost = 0; - uint32_t cost = uvg_image_calc_sad( + double bitcost = 0; + double cost = uvg_image_calc_sad( info->pic, info->ref, info->origin.x, @@ -230,25 +223,25 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) info->optimized_sad ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; cost += info->mvd_cost_func( info->state, x, y, INTERNAL_MV_PREC, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcost ); - if (cost >= info->best_cost) return false; + if (cost >= *best_cost) return false; // Set to motion vector in internal pixel precision. - info->best_mv.x = x * (1 << INTERNAL_MV_PREC); - info->best_mv.y = y * (1 << INTERNAL_MV_PREC); - info->best_cost = cost; - info->best_bitcost = bitcost; + best_mv->x = x * (1 << INTERNAL_MV_PREC); + best_mv->y = y * (1 << INTERNAL_MV_PREC); + *best_cost = cost; + *best_bits = bitcost; return true; } @@ -256,10 +249,10 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y) static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) { - // Calculate 2 * log2(symbol + 2) + // Calculate 2 * log2(symbol ) unsigned bins = 0; - symbol += 2; + symbol += 0; if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } @@ -299,12 +292,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv) * \brief Select starting point for integer motion estimation search. 
* * Checks the zero vector, extra_mv and merge candidates and updates - * info->best_mv to the best one. + * best_mv to the best one. */ -static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv) +static void select_starting_point(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. - check_mv_cost(info, 0, 0); + check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv); // Change to integer precision. extra_mv.x >>= INTERNAL_MV_PREC; @@ -312,7 +309,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv // Check mv_in if it's not one of the merge candidates. if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { - check_mv_cost(info, extra_mv.x, extra_mv.y); + check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } // Go through candidates @@ -324,49 +321,26 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv if (x == 0 && y == 0) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } -static uint32_t get_mvd_coding_cost(const encoder_state_t *state, - const cabac_data_t* cabac, - const int32_t mvd_hor, - const int32_t mvd_ver) +static double get_mvd_coding_cost(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - unsigned bitcost = 0; - - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_hor != 0)); - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_ver != 0)); - - if (hor_abs_gr0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_hor_abs > 1)); - } - if (ver_abs_gr0) { - 
bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_ver_abs > 1)); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; - } + double bitcost = 4 << CTX_FRAC_BITS; + const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; + bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += abs_mvd.y == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; // Round and shift back to integer bits. - return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + return bitcost / (1 << CTX_FRAC_BITS); } @@ -374,7 +348,7 @@ static int select_mv_cand(const encoder_state_t *state, mv_t mv_cand[2][2], int32_t mv_x, int32_t mv_y, - uint32_t *cost_out) + double*cost_out) { const bool same_cand = (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); @@ -384,7 +358,7 @@ static int select_mv_cand(const encoder_state_t *state, return 0; } - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + double (*mvd_coding_cost)(const encoder_state_t * const state, const cabac_data_t*, int32_t, int32_t); if (state->encoder_control->cfg.mv_rdo) { @@ -397,12 +371,12 @@ static int select_mv_cand(const encoder_state_t *state, uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd); - uint32_t cand1_cost = mvd_coding_cost( + double cand1_cost = mvd_coding_cost( state, &state->cabac, mvd.x, mvd.y); - uint32_t cand2_cost; + double cand2_cost; if (same_cand) { cand2_cost = cand1_cost; } else { @@ -423,17 +397,17 @@ static int select_mv_cand(const encoder_state_t *state, } -static uint32_t calc_mvd_cost(const encoder_state_t *state, - int x, - int y, 
- int mv_shift, - mv_t mv_cand[2][2], - inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, - int32_t ref_idx, - uint32_t *bitcost) +static double calc_mvd_cost(const encoder_state_t *state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) { - uint32_t temp_bitcost = 0; + double temp_bitcost = 0; uint32_t merge_idx; int8_t merged = 0; @@ -456,23 +430,26 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state, // Check mvd cost only if mv is not merged if (!merged) { - uint32_t mvd_cost = 0; + double mvd_cost = 0; select_mv_cand(state, mv_cand, x, y, &mvd_cost); temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; - return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); + return temp_bitcost * state->lambda_sqrt; } -static bool early_terminate(inter_search_info_t *info) +static bool early_terminate(inter_search_info_t *info, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { static const vector2d_t small_hexbs[7] = { { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, { 0, -1 }, { -1, 0 }, { 0, 0 }, }; - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; int first_index = 0; int last_index = 3; @@ -482,9 +459,9 @@ static bool early_terminate(inter_search_info_t *info) if (info->state->encoder_control->cfg.me_early_termination == UVG_ME_EARLY_TERMINATION_SENSITIVE) { - threshold = info->best_cost * 0.95; + threshold = *best_cost * 0.95; } else { - threshold = info->best_cost; + threshold = *best_cost; } int best_index = 6; @@ -492,7 +469,7 @@ static bool early_terminate(inter_search_info_t *info) int x = mv.x + small_hexbs[i].x; int y = mv.y + small_hexbs[i].y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -502,7 
+479,7 @@ static bool early_terminate(inter_search_info_t *info) mv.y += small_hexbs[best_index].y; // If best match is not better than threshold, we stop the search. - if (info->best_cost >= threshold) { + if (*best_cost >= threshold) { return true; } @@ -517,7 +494,10 @@ void uvg_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, vector2d_t mv, - int *best_dist) + int *best_dist, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { assert(pattern_type < 4); @@ -619,7 +599,7 @@ void uvg_tz_pattern_search(inter_search_info_t *info, int x = mv.x + offset.x; int y = mv.y + offset.y; - if (check_mv_cost(info, x, y)) { + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -632,20 +612,27 @@ void uvg_tz_pattern_search(inter_search_info_t *info, void uvg_tz_raster_search(inter_search_info_t *info, int iSearchRange, - int iRaster) + int iRaster, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { - const vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + const vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; //compute SAD values for every point in the iRaster downsampled version of the current search area for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) { for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) { - check_mv_cost(info, mv.x + x, mv.y + y); + check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv); } } } -static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) +static void tz_search(inter_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { //TZ parameters const int iSearchRange = 96; // search range for each stage @@ -657,25 +644,13 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode 
will be executed) int best_dist = 0; - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } - - vector2d_t start = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + + vector2d_t start = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // step 2, grid search int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); // Break the loop if the last three rounds didn't produce a better MV. if (best_dist != iDist) rounds_without_improvement++; @@ -688,7 +663,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) start.y = 0; rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { - uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); if (best_dist != iDist) rounds_without_improvement++; if (rounds_without_improvement >= 3) break; @@ -698,7 +673,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //step 3, raster scan if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; - uvg_tz_raster_search(info, iSearchRange, iRaster); + uvg_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv); } //step 4 @@ -706,19 +681,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) //raster refinement if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - start.x = 
info->best_mv.x >> INTERNAL_MV_PREC; - start.y = info->best_mv.y >> INTERNAL_MV_PREC; - uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + start.x = best_mv->x >> INTERNAL_MV_PREC; + start.y = best_mv->y >> INTERNAL_MV_PREC; + uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } //star refinement (repeat step 2 for the current starting point) while (use_star_refinement && best_dist > 0) { best_dist = 0; - start.x = info->best_mv.x >> INTERNAL_MV_PREC; - start.y = info->best_mv.y >> INTERNAL_MV_PREC; + start.x = best_mv->x >> INTERNAL_MV_PREC; + start.y = best_mv->y >> INTERNAL_MV_PREC; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist); + uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv); } } } @@ -740,7 +715,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv) * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void hexagon_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -765,27 +745,14 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). 
- select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } - - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. for (int i = 1; i < 7; ++i) { - if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) { + if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -814,7 +781,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Iterate through the next 3 points. for (int i = 0; i < 3; ++i) { vector2d_t offset = large_hexbs[start + i]; - if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) { + if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) { best_index = start + i; } } @@ -826,7 +793,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // Do the final step of the search with a small pattern. for (int i = 1; i < 9; ++i) { - check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); + check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv); } } @@ -846,7 +813,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. 
**/ -static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +static void diamond_search(inter_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { enum diapos { DIA_UP = 0, @@ -864,29 +836,16 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 {0, -1}, {1, 0}, {0, 1}, {-1, 0}, {0, 0} }; - - info->best_cost = UINT32_MAX; - - // Select starting point from among merge candidates. These should - // include both mv_cand vectors and (0, 0). - select_starting_point(info, extra_mv); - - // Check if we should stop search - if (info->state->encoder_control->cfg.me_early_termination && - early_terminate(info)) - { - return; - } // current motion vector - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // current best index enum diapos best_index = DIA_CENTER; // initial search of the points of the diamond for (int i = 0; i < 5; ++i) { - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; } } @@ -916,7 +875,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 // this is where we came from so it's checked already if (i == from_dir) continue; - if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { best_index = i; better_found = 1; } @@ -938,12 +897,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3 static void search_mv_full(inter_search_info_t *info, int32_t search_range, - vector2d_t extra_mv) + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) { // Search 
around the 0-vector. for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } @@ -955,7 +917,7 @@ static void search_mv_full(inter_search_info_t *info, if (!mv_in_merge(info, extra_mv)) { for (int y = -search_range; y <= search_range; y++) { for (int x = -search_range; x <= search_range; x++) { - check_mv_cost(info, extra_mv.x + x, extra_mv.y + y); + check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv); } } } @@ -1002,7 +964,7 @@ static void search_mv_full(inter_search_info_t *info, } if (already_tested) continue; - check_mv_cost(info, x, y); + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); } } } @@ -1015,7 +977,10 @@ static void search_mv_full(inter_search_info_t *info, * Algoritm first searches 1/2-pel positions around integer mv and after best match is found, * refines the search by searching best 1/4-pel postion around best 1/2-pel position. 
*/ -static void search_frac(inter_search_info_t *info) +static void search_frac(inter_search_info_t *info, + double *best_cost, + double *best_bits, + vector2d_t *best_mv) { // Map indexes to relative coordinates in the following way: // 5 3 6 @@ -1028,13 +993,14 @@ static void search_frac(inter_search_info_t *info) }; // Set mv to pixel precision - vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC }; + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; - unsigned best_cost = UINT32_MAX; - uint32_t best_bitcost = 0; - uint32_t bitcosts[4] = { 0 }; + double cost = MAX_DOUBLE; + double bitcost = 0; + double bitcosts[4] = { 0 }; unsigned best_index = 0; +// Keep this as unsigned until SAD / SATD functions are updated unsigned costs[4] = { 0 }; ALIGNED(64) uvg_pixel filtered[4][LCU_LUMA_SIZE]; @@ -1100,12 +1066,12 @@ static void search_frac(inter_search_info_t *info) costs[0] += info->mvd_cost_func(state, mv.x, mv.y, INTERNAL_MV_PREC, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[0]); - best_cost = costs[0]; - best_bitcost = bitcosts[0]; + cost = costs[0]; + bitcost = bitcosts[0]; //Set mv to half-pixel precision mv.x *= 2; @@ -1160,8 +1126,8 @@ static void search_frac(inter_search_info_t *info) mv.y + pattern[j]->y, mv_shift, info->mv_cand, - info->merge_cand, - info->num_merge_cand, + NULL, + 0, info->ref_idx, &bitcosts[j] ); @@ -1169,9 +1135,9 @@ static void search_frac(inter_search_info_t *info) } for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; + if (within_tile[j] && costs[j] < cost) { + cost = costs[j]; + bitcost = bitcosts[j]; best_index = i + j; } } @@ -1201,9 +1167,38 @@ static void search_frac(inter_search_info_t *info) mv.x *= 1 << (INTERNAL_MV_PREC - 2); mv.y *= 1 << (INTERNAL_MV_PREC - 2); - info->best_mv = mv; - info->best_cost = best_cost; - 
info->best_bitcost = best_bitcost; + *best_mv = mv; + *best_cost = cost; + *best_bits = bitcost; +} + +int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx) { + assert(!(lcu && cu_a)); + int context = 0; + const cu_info_t* left_pu = NULL; + const cu_info_t* top_pu = NULL; + if(lcu) { + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + if (x) { + left_pu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + if (y) { + top_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); + } + } + else { + if (x > 0) { + left_pu = uvg_cu_array_at_const(cu_a, x - 1, y); + } + if (y > 0) { + top_pu = uvg_cu_array_at_const(cu_a, x, y - 1); + } + } + context += left_pu && left_pu->skipped; + context += top_pu && top_pu->skipped; + if (predmode_ctx) *predmode_ctx = (left_pu && left_pu->type == CU_INTRA) || (top_pu && top_pu->type == CU_INTRA); + return context; } /** @@ -1251,46 +1246,37 @@ static void apply_mv_scaling(int32_t current_poc, */ static void search_pu_inter_ref(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost, - double *best_LX_cost, - cu_info_t *unipred_LX) + lcu_t *lcu, + cu_info_t *cur_cu, + unit_stats_map_t *amvp) { const uvg_config *cfg = &info->state->encoder_control->cfg; - // which list, L0 or L1, ref_idx is in and in what index - int8_t ref_list = -1; - // the index of the ref_idx in L0 or L1 list - int8_t LX_idx; - // max value of LX_idx plus one - const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + // Reference picture might be in both lists + bool ref_list_active[2] = { false, false }; + // Reference picture indices in L0 and L1 lists + int8_t ref_list_idx[2] = { -1, -1 }; - for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) - { - // check if ref_idx is in L0 - if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { - 
ref_list = 0; - break; - } - - // check if ref_idx is in L1 - if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { - ref_list = 1; - break; + // Check if ref picture is present in the lists + for (int ref_list = 0; ref_list < 2; ++ref_list) { + for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) { + if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) { + ref_list_active[ref_list] = true; + ref_list_idx[ref_list] = i; + break; + } } } - // ref_idx has to be found in either L0 or L1 - assert(LX_idx < LX_IDX_MAX_PLUS_1); - // store temp values to be stored back later - int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list]; + // Must find at least one reference picture + assert(ref_list_active[0] || ref_list_active[1]); + + // Does not matter which list is used, if in both. + int ref_list = ref_list_active[0] ? 0 : 1; + int LX_idx = ref_list_idx[ref_list]; // Get MV candidates - cur_cu->inter.mv_ref[ref_list] = LX_idx; + cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; uvg_inter_get_mv_cand(info->state, info->origin.x, @@ -1302,10 +1288,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, lcu, ref_list); - // store old values back - cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; - - vector2d_t mv = { 0, 0 }; + vector2d_t best_mv = { 0, 0 }; // Take starting point for MV search from previous frame. 
// When temporal motion vector candidates are added, there is probably @@ -1319,8 +1302,7 @@ static void search_pu_inter_ref(inter_search_info_t *info, if (ref_cu->inter.mv_dir & 1) { mv_previous.x = ref_cu->inter.mv[0][0]; mv_previous.y = ref_cu->inter.mv[0][1]; - } - else { + } else { mv_previous.x = ref_cu->inter.mv[1][0]; mv_previous.y = ref_cu->inter.mv[1][1]; } @@ -1353,16 +1335,16 @@ static void search_pu_inter_ref(inter_search_info_t *info, info->state->frame->ref->pocs[neighbor_poc_index], info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ info->state->frame->ref->ref_LXs[neighbor_poc_index] - [col_list] + [col_list] [ref_cu->inter.mv_ref[col_list]] ], &mv_previous - ); + ); } // Check if the mv is valid after scaling if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) { - mv = mv_previous; + best_mv = mv_previous; } } @@ -1375,102 +1357,90 @@ static void search_pu_inter_ref(inter_search_info_t *info, default: break; } - info->best_cost = UINT32_MAX; + double best_cost = MAX_DOUBLE; + double best_bits = MAX_INT; - switch (cfg->ime_algorithm) { - case UVG_IME_TZ: - tz_search(info, mv); - break; + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). 
+ select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { - case UVG_IME_FULL64: - case UVG_IME_FULL32: - case UVG_IME_FULL16: - case UVG_IME_FULL8: - case UVG_IME_FULL: - search_mv_full(info, search_range, mv); - break; + switch (cfg->ime_algorithm) { + case UVG_IME_TZ: + tz_search(info, best_mv, &best_cost, &best_bits, &best_mv); + break; - case UVG_IME_DIA: - diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; + case UVG_IME_FULL64: + case UVG_IME_FULL32: + case UVG_IME_FULL16: + case UVG_IME_FULL8: + case UVG_IME_FULL: + search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv); + break; - default: - hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); - break; - } + case UVG_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; - if (cfg->fme_level > 0 && info->best_cost < *inter_cost) { - search_frac(info); - - } else if (info->best_cost < UINT32_MAX) { - // Recalculate inter cost with SATD. 
- info->best_cost = uvg_image_calc_satd( - info->state->tile->frame->source, - info->ref, - info->origin.x, - info->origin.y, - info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> INTERNAL_MV_PREC), - info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> INTERNAL_MV_PREC), - info->width, - info->height); - info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5); - } - - mv = info->best_mv; - - int merged = 0; - int merge_idx = 0; - // Check every candidate to find a match - for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (info->merge_cand[merge_idx].dir != 3 && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == mv.x && - info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y && - (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][ - info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx) - { - merged = 1; - break; + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; } } - // Only check when candidates are different - int cu_mv_cand = 0; - if (!merged) { - cu_mv_cand = - select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); + if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. 
+ best_cost = uvg_image_calc_satd( + info->state->tile->frame->source, + info->ref, + info->origin.x, + info->origin.y, + info->state->tile->offset_x + info->origin.x + (best_mv.x >> INTERNAL_MV_PREC), + info->state->tile->offset_y + info->origin.y + (best_mv.y >> INTERNAL_MV_PREC), + info->width, + info->height); + best_cost += best_bits * info->state->lambda_sqrt; } - if (info->best_cost < *inter_cost) { - // Map reference index to L0/L1 pictures - cur_cu->inter.mv_dir = ref_list+1; + double LX_cost[2] = { best_cost, best_cost }; + double LX_bits[2] = { best_bits, best_bits }; + + // Compute costs and add entries for both lists, if necessary + for (; ref_list < 2 && ref_list_active[ref_list]; ++ref_list) { + + LX_idx = ref_list_idx[ref_list]; uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing + LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt; + LX_bits[ref_list] += extra_bits; - cur_cu->merged = merged; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_ref[ref_list] = LX_idx; - cur_cu->inter.mv[ref_list][0] = (mv_t)mv.x; - cur_cu->inter.mv[ref_list][1] = (mv_t)mv.y; + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { - CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand); - - *inter_cost = info->best_cost; - *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; - } - - - // Update best unipreds for biprediction - if (info->best_cost < best_LX_cost[ref_list]) { - bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); - if (valid_mv) { // Map reference index to L0/L1 pictures - unipred_LX[ref_list].inter.mv_dir = ref_list + 1; - unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; - unipred_LX[ref_list].inter.mv[ref_list][0] = (mv_t)mv.x; - 
unipred_LX[ref_list].inter.mv[ref_list][1] = (mv_t)mv.y; + unit_stats_map_t *cur_map = &amvp[ref_list]; + int entry = cur_map->size; + cu_info_t *unipred_pu = &cur_map->unit[entry]; + *unipred_pu = *cur_cu; + unipred_pu->type = CU_INTER; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = ref_list + 1; + unipred_pu->inter.mv_ref[ref_list] = LX_idx; + unipred_pu->inter.mv[ref_list][0] = (mv_t)best_mv.x; + unipred_pu->inter.mv[ref_list][1] = (mv_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand); - CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); - - best_LX_cost[ref_list] = info->best_cost; + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; + cur_map->keys[entry] = entry; + cur_map->size++; } } } @@ -1481,9 +1451,8 @@ static void search_pu_inter_ref(inter_search_info_t *info, */ static void search_pu_inter_bipred(inter_search_info_t *info, int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; @@ -1515,11 +1484,26 @@ static void search_pu_inter_bipred(inter_search_info_t *info, continue; } - mv_t mv[2][2]; + cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size]; + *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + bipred_pu->inter.mv_dir = 3; + + bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0]; + bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + int16_t(*mv)[2] = bipred_pu->inter.mv; mv[0][0] = merge_cand[i].mv[0][0]; mv[0][1] = merge_cand[i].mv[0][1]; mv[1][0] = merge_cand[j].mv[1][0]; mv[1][1] = merge_cand[j].mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + } // Don't try merge candidates 
that don't satisfy mv constraints. if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || @@ -1541,10 +1525,10 @@ static void search_pu_inter_bipred(inter_search_info_t *info, const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride]; - uint32_t cost = + double cost = uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->stride); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; cost += info->mvd_cost_func(info->state, merge_cand[i].mv[0][0], @@ -1566,51 +1550,25 @@ static void search_pu_inter_bipred(inter_search_info_t *info, merge_cand[j].ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info->state->lambda_sqrt * extra_bits + 0.5; + cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].dir != 3) continue; - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, 
width, height, info->mv_cand, cur_cu, lcu, reflist); - int cu_mv_cand = select_mv_cand( - info->state, - info->mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], - NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } + + bipred_pu->type = CU_INTER; + + amvp_bipred->cost[amvp_bipred->size] = cost; + amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits; + amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size; + amvp_bipred->size++; } } @@ -1624,14 +1582,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info, * * \return Does an identical candidate exist in list */ -static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, - inter_merge_cand_t * cand_to_add, - int8_t * added_idx_list, - int list_size) +static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, + inter_merge_cand_t *cand_to_add, + unit_stats_map_t *merge) { bool found = false; - for (int i = 0; i < list_size && !found; ++i) { - inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]]; + for (int i = 0; i < merge->size && !found; ++i) { + int key = merge->keys[i]; + inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx]; found = cand_to_add->dir == list_cand->dir && cand_to_add->ref[0] == list_cand->ref[0] && @@ -1646,7 +1604,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, } /** - * \brief Update PU to have best modes at this depth. + * \brief Collect PU parameters and costs at this depth. 
* * \param state encoder state * \param x_cu x-coordinate of the containing CU @@ -1656,28 +1614,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, * \param i_pu index of the PU in the CU * \param lcu containing LCU * - * \param inter_cost Return inter cost of the best mode - * \param inter_bitcost Return inter bitcost of the best mode + * \param amvp Return searched AMVP PUs sorted by costs + * \param merge Return searched Merge PUs sorted by costs */ static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost) + int x_cu, int y_cu, + int depth, + part_mode_t part_mode, + int i_pu, + lcu_t *lcu, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + inter_search_info_t *info) { - *inter_cost = MAX_INT; - *inter_bitcost = MAX_INT; - const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = LCU_WIDTH >> depth; + const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); + const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); + const int width = PU_GET_W(part_mode, width_cu, i_pu); + const int height = PU_GET_H(part_mode, width_cu, i_pu); // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and // nRx2N partitions. @@ -1686,129 +1642,162 @@ static void search_pu_inter(encoder_state_t * const state, // 2NxnD partitions. 
const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_NOTSET; + cur_pu->part_size = part_mode; + cur_pu->depth = depth; + cur_pu->qp = state->qp; - inter_search_info_t info = { - .state = state, - .pic = frame->source, - .origin = { x, y }, - .width = width, - .height = height, - .mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost, - .optimized_sad = uvg_get_optimized_sad(width), - }; + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + CU_SET_MV_CAND(cur_pu, 1, 0); + + FILL(*info, 0); + + info->state = state; + info->pic = frame->source; + info->origin.x = x; + info->origin.y = y; + info->width = width; + info->height = height; + info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost; + info->optimized_sad = uvg_get_optimized_sad(width); // Search for merge mode candidates - info.num_merge_cand = uvg_inter_get_merge_cand( + info->num_merge_cand = uvg_inter_get_merge_cand( state, x, y, width, height, merge_a1, merge_b1, - info.merge_cand, + info->merge_cand, lcu ); - // Default to candidate 0 - CU_SET_MV_CAND(cur_cu, 0, 0); - CU_SET_MV_CAND(cur_cu, 1, 0); - // Merge Analysis starts here - int8_t mrg_cands[MRG_MAX_NUM_CANDS]; - double mrg_costs[MRG_MAX_NUM_CANDS]; + merge->size = 0; for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { - mrg_cands[i] = -1; - mrg_costs[i] = MAX_DOUBLE; + merge->keys[i] = -1; + merge->cost[i] = MAX_DOUBLE; } - int num_rdo_cands = 0; - + const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default + 
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); +#else + const double no_skip_flag = 0; +#endif // Check motion vector constraints and perform rough search - for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; + for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; + inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx]; + cur_pu->inter.mv_dir = cur_cand->dir; + cur_pu->inter.mv_ref[0] = cur_cand->ref[0]; + cur_pu->inter.mv_ref[1] = cur_cand->ref[1]; + cur_pu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_pu->inter.mv[0][1] = cur_cand->mv[0][1]; + cur_pu->inter.mv[1][0] = cur_cand->mv[1][0]; + cur_pu->inter.mv[1][1] = cur_cand->mv[1][1]; // If bipred is not enabled, do not try candidates with mv_dir == 3. // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. - if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; + if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; - bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, - mrg_cands, - num_rdo_cands); + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); // Don't try merge candidates that don't satisfy mv constraints. 
// Don't add duplicates to list - if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + bool active_L0 = cur_pu->inter.mv_dir & 1; + bool active_L1 = cur_pu->inter.mv_dir & 2; + if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) || + (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) || is_duplicate) { continue; } uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); - mrg_costs[num_rdo_cands] = uvg_satd_any_size(width, height, - lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, - lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); - - // Add cost of coding the merge index - mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt; + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_INTER; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; - mrg_cands[num_rdo_cands] = merge_idx; - num_rdo_cands++; + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + } + else { + merge->cost[merge->size] = uvg_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + bits += no_skip_flag; + merge->cost[merge->size] += bits * info->state->lambda_sqrt; + } + // Add cost of coding the merge index + merge->bits[merge->size] = bits; + merge->keys[merge->size] = merge->size; + + + merge->size++; } - // Sort candidates by cost - uvg_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + assert(merge->size <= 
MAX_UNIT_STATS_MAP_SIZE); + uvg_sort_keys_by_cost(merge); - // Limit by availability - // TODO: Do not limit to just 1 - num_rdo_cands = MIN(1, num_rdo_cands); + // Try early skip decision on just one merge candidate if available + int num_rdo_cands = MIN(1, merge->size); // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { - for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { - - // Reconstruct blocks with merge candidate. - // Check luma CBF. Then, check chroma CBFs if luma CBF is not set - // and chroma exists. - // Early terminate if merge candidate with zero CBF is found. - int merge_idx = mrg_cands[merge_rdo_idx]; - inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; - - cur_cu->inter.mv_dir = cur_cand->dir; - cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; - cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; - cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; - cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; - cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; - cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; - - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true); - - if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { - continue; + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { + if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { + merge->size = 1; + merge->bits[0] = merge->bits[merge->keys[merge_key]]; + merge->cost[0] = merge->cost[merge->keys[merge_key]]; + merge->unit[0] = merge->unit[merge->keys[merge_key]]; + merge->keys[0] = 0; } - else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); - uvg_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true); - if 
(!cbf_is_set_any(cur_cu->cbf, depth)) { - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->skipped = true; - *inter_cost = 0.0; // TODO: Check this - *inter_bitcost = merge_idx; // TODO: Check this - return; + else if(cfg->rdo < 2) { + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. + int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0]; + cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1]; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; + cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; + uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + uvg_inter_recon_cu(state, lcu, x, y, width, true, false); + uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true); + + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + continue; + } + else if (has_chroma) { + uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_quantize_lcu_residual(state, false, has_chroma, + false, /*we are only checking for lack of coeffs so no need to check jccr*/ + x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_INTER; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = true; + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; + return; + } } } } @@ -1816,16 +1805,139 @@ static void search_pu_inter(encoder_state_t * const state, // AMVP search starts here - // Store unipred information of L0 and L1 for biprediction - // Best cost will be left at 
MAX_DOUBLE if no valid CU is found - double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; - cu_info_t unipreds[2]; + amvp[0].size = 0; + amvp[1].size = 0; + amvp[2].size = 0; + + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + for (int i = 0; i < state->frame->ref->used_size; ++i) { + amvp[mv_dir - 1].cost[i] = MAX_DOUBLE; + } + } for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { - info.ref_idx = ref_idx; - info.ref = state->frame->ref->images[ref_idx]; + info->ref_idx = ref_idx; + info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds); + search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + } + + assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); + assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE); + uvg_sort_keys_by_cost(&amvp[0]); + uvg_sort_keys_by_cost(&amvp[1]); + + int best_keys[2] = { + amvp[0].size > 0 ? amvp[0].keys[0] : 0, + amvp[1].size > 0 ? amvp[1].keys[0] : 0 + }; + + cu_info_t *best_unipred[2] = { + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] + }; + + // Prevent using the same ref picture with both lists. + // TODO: allow searching two MVs from the same reference picture. + if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) { + + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int L0_idx = best_unipred[0]->inter.mv_ref[0]; + int L1_idx = best_unipred[1]->inter.mv_ref[1]; + + int L0_ref_idx = ref_LX[0][L0_idx]; + int L1_ref_idx = ref_LX[1][L1_idx]; + + if (L0_ref_idx == L1_ref_idx) { + // Invalidate the other based the list that has the 2nd best PU + double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE; + double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE; + int list = (L0_2nd_cost <= L1_2nd_cost) ? 
1 : 0; + amvp[list].cost[best_keys[list]] = MAX_DOUBLE; + uvg_sort_keys_by_cost(&amvp[list]); + amvp[list].size--; + best_keys[list] = amvp[list].keys[0]; + best_unipred[list] = &amvp[list].unit[best_keys[list]]; + } + } + + // Fractional-pixel motion estimation. + // Refine the best PUs so far from both lists, if available. + for (int list = 0; list < 2; ++list) { + + // TODO: make configurable + int n_best = MIN(1, amvp[list].size); + if (cfg->fme_level > 0) { + + for (int i = 0; i < n_best; ++i) { + + int key = amvp[list].keys[i]; + cu_info_t *unipred_pu = &amvp[list].unit[key]; + + // Find the reference picture + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; + + int LX_idx = unipred_pu->inter.mv_ref[list]; + info->ref_idx = ref_LX[list][LX_idx]; + info->ref = ref->images[info->ref_idx]; + + uvg_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + unipred_pu, + lcu, + list); + + double frac_cost = MAX_DOUBLE; + double frac_bits = MAX_INT; + vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] }; + + search_frac(info, &frac_cost, &frac_bits, &frac_mv); + + uint8_t mv_ref_coded = LX_idx; + int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL); + const int extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing + frac_cost += extra_bits * info->state->lambda_sqrt; + frac_bits += extra_bits; + + bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y); + if (valid_mv) { + + unipred_pu->inter.mv[list][0] = frac_mv.x; + unipred_pu->inter.mv[list][1] = frac_mv.y; + CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); + + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + } + + amvp[list].cost[key] = frac_cost; + amvp[list].bits[key] = frac_bits; + 
} + } + + // Invalidate PUs with SAD-based costs. (FME not performed). + // TODO: Recalculate SAD costs with SATD for further processing. + for (int i = n_best; i < amvp[list].size; ++i) { + int key = amvp[list].keys[i]; + amvp[list].cost[key] = MAX_DOUBLE; + } + } + + // Costs are now, SATD-based. Omit PUs with SAD-based costs. + // TODO: Recalculate SAD costs with SATD for further processing. + uvg_sort_keys_by_cost(&amvp[list]); + amvp[list].size = n_best; + } + + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); } // Search bi-pred positions @@ -1835,25 +1947,39 @@ static void search_pu_inter(encoder_state_t * const state, if (can_use_bipred) { + cu_info_t *bipred_pu = &amvp[2].unit[0]; + *bipred_pu = *cur_pu; + double best_bipred_cost = MAX_DOUBLE; + // Try biprediction from valid acquired unipreds. - if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) { + if (amvp[0].size > 0 && amvp[1].size > 0) { // TODO: logic is copy paste from search_pu_inter_bipred. // Get rid of duplicate code asap. 
- const image_list_t *const ref = info.state->frame->ref; - uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; + const image_list_t *const ref = info->state->frame->ref; + uint8_t(*ref_LX)[16] = info->state->frame->ref_LX; - inter_merge_cand_t *merge_cand = info.merge_cand; + bipred_pu->inter.mv_dir = 3; - mv_t mv[2][2]; - mv[0][0] = unipreds[0].inter.mv[0][0]; - mv[0][1] = unipreds[0].inter.mv[0][1]; - mv[1][0] = unipreds[1].inter.mv[1][0]; - mv[1][1] = unipreds[1].inter.mv[1][1]; + bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0]; + bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1]; - uvg_inter_recon_bipred(info.state, - ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]], - ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]], + int16_t (*mv)[2] = bipred_pu->inter.mv; + mv[0][0] = best_unipred[0]->inter.mv[0][0]; + mv[0][1] = best_unipred[0]->inter.mv[0][1]; + mv[1][0] = best_unipred[1]->inter.mv[1][0]; + mv[1][1] = best_unipred[1]->inter.mv[1][1]; + + bipred_pu->merged = false; + bipred_pu->skipped = false; + + for (int reflist = 0; reflist < 2; reflist++) { + uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + } + + uvg_inter_recon_bipred(info->state, + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], x, y, width, height, @@ -1864,104 +1990,77 @@ static void search_pu_inter(encoder_state_t * const state, const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - uint32_t cost = + + best_bipred_cost = uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); - uint32_t bitcost[2] = { 0, 0 }; + double bitcost[2] = { 0, 0 }; - cost += info.mvd_cost_func(info.state, - unipreds[0].inter.mv[0][0], - unipreds[0].inter.mv[0][1], + best_bipred_cost += info->mvd_cost_func(info->state, + bipred_pu->inter.mv[0][0], + bipred_pu->inter.mv[0][1], 0, - 
info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[0]); - cost += info.mvd_cost_func(info.state, - unipreds[1].inter.mv[1][0], - unipreds[1].inter.mv[1][1], + best_bipred_cost += info->mvd_cost_func(info->state, + bipred_pu->inter.mv[1][0], + bipred_pu->inter.mv[1][1], 0, - info.mv_cand, + info->mv_cand, NULL, 0, 0, &bitcost[1]); const uint8_t mv_ref_coded[2] = { - unipreds[0].inter.mv_ref[0], - unipreds[1].inter.mv_ref[1] + bipred_pu->inter.mv_ref[0], + bipred_pu->inter.mv_ref[1] }; const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += info.state->lambda_sqrt * extra_bits + 0.5; + best_bipred_cost += info->state->lambda_sqrt * extra_bits; - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0]; - cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1]; - - cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0]; - cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1]; - cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0]; - cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].dir != 3) continue; - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } + if (best_bipred_cost < MAX_DOUBLE) { // Each motion vector has its own candidate for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); int cu_mv_cand = select_mv_cand( - info.state, - 
info.mv_cand, - cur_cu->inter.mv[reflist][0], - cur_cu->inter.mv[reflist][1], + info->state, + info->mv_cand, + bipred_pu->inter.mv[reflist][0], + bipred_pu->inter.mv[reflist][1], NULL); - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand); } - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].cost[amvp[2].size] = best_bipred_cost; + amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits; + amvp[2].keys[amvp[2].size] = amvp[2].size; + amvp[2].size++; } } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) { - search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + + assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); + uvg_sort_keys_by_cost(&amvp[2]); + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); } } + if(cfg->rdo < 2) { + int predmode_ctx; + const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); - // Compare best merge cost to amvp cost - if (mrg_costs[0] < *inter_cost) { - *inter_cost = mrg_costs[0]; - *inter_bitcost = 0; // TODO: Check this - int merge_idx = mrg_cands[0]; - cur_cu->type = CU_INTER; - cur_cu->merge_idx = merge_idx; - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - 
cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; - cur_cu->merged = true; - cur_cu->skipped = false; - } - - if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { - assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1])); + const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); + const double total_bits = no_skip_flag + pred_mode_bits; + for(int i = 0; i < 3; i++) { + if(amvp[i].size > 0) { + const uint8_t best_key = amvp[i].keys[0]; + amvp[i].bits[best_key] += total_bits; + amvp[i].cost[best_key] += (total_bits)* state->lambda_sqrt; + } + } } } @@ -1985,32 +2084,99 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_bitcost Return inter bitcost */ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - uint32_t *inter_bitcost){ - - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int x, int y, int depth, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost){ + int tr_depth = MAX(1, depth); if (cur_cu->part_size != SIZE_2Nx2N) { tr_depth = depth + 1; } uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + const int x_px = SUB_SCU(x); + const int y_px = SUB_SCU(y); + const int width = LCU_WIDTH >> depth; + cabac_data_t cabac_copy; + memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); + cabac_copy.update = 1; + + cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + *cur_pu = *cur_cu; + const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400; uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); - uvg_quantize_lcu_residual(state, true, reconstruct_chroma, - x, y, depth, - NULL, - lcu, - false); - *inter_cost = uvg_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + int index = y_px * LCU_WIDTH + x_px; + double ssd = 
uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width) * UVG_LUMA_MULT; if (reconstruct_chroma) { - *inter_cost += uvg_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; + double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width / 2); + double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width / 2); + ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; } + double no_cbf_bits; + double bits = 0; + const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL); + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; + bits += uvg_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + } + else { + no_cbf_bits = uvg_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu); + bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1); + } + double no_cbf_cost = ssd + no_cbf_bits * state->lambda; - *inter_cost += *inter_bitcost * state->lambda; + uvg_quantize_lcu_residual(state, + true, reconstruct_chroma, + reconstruct_chroma && state->encoder_control->cfg.jccr, x, y, + depth, + cur_cu, + lcu, + false); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + if(cbf) { + *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); + if (reconstruct_chroma) { + if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { + *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + } + else { + uvg_select_jccr_mode(state, x_px, y_px, depth, cur_cu, lcu, inter_cost); + } + } + } + else { + // If we have no coeffs after quant we already have the cost calculated + *inter_cost = no_cbf_cost; + cur_cu->cbf 
= 0; + *inter_bitcost = no_cbf_bits; + return; + } + + *inter_cost += (bits)* state->lambda; + *inter_bitcost = bits; + + if(no_cbf_cost < *inter_cost) { + cur_cu->cbf = 0; + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->skipped = 1; + } + *inter_cost = no_cbf_cost; + *inter_bitcost = no_cbf_bits; + + } } @@ -2032,21 +2198,79 @@ void uvg_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost) + double* inter_bitcost) { + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. + unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + inter_search_info_t info; + search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu, - inter_cost, - inter_bitcost); + amvp, + &merge, + &info); - // Calculate more accurate cost when needed - if (state->encoder_control->cfg.rdo >= 2) { - uvg_cu_cost_inter_rd2(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); + // Early Skip CU decision + if (merge.size == 1 && merge.unit[0].skipped) { + *inter_cost = merge.cost[0]; + *inter_bitcost = merge.bits[0]; + return; + } + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this + } + + if 
(*inter_cost == MAX_DOUBLE) { + // Could not find any motion vector. + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + return; + } + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + *cur_pu = *best_inter_pu; + + uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), + true, state->encoder_control->chroma_format != UVG_CSP_400); + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } + + if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) { + assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])); } } diff --git a/src/search_inter.h b/src/search_inter.h index d1e1ee71..d76dd927 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -64,20 +64,34 @@ enum hpel_position { HPEL_POS_DIA = 2 }; -typedef uint32_t uvg_mvd_cost_func(const encoder_state_t *state, +typedef double uvg_mvd_cost_func(const encoder_state_t *state, int x, int y, int mv_shift, mv_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, - uint32_t *bitcost); + double *bitcost); void uvg_search_cu_inter(encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu, double *inter_cost, - uint32_t *inter_bitcost); + double* inter_bitcost); + +unsigned uvg_inter_satd_cost(const encoder_state_t* state, + const lcu_t *lcu, + int x, + int y); +void uvg_cu_cost_inter_rd2(encoder_state_t* const state, + int x, int y, int depth, + cu_info_t* cur_cu, + lcu_t* lcu, + double* inter_cost, + double* inter_bitcost); + +int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); + #endif // SEARCH_INTER_H_ diff --git a/src/search_intra.c b/src/search_intra.c index e89720fb..9dc24fba 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -37,6 +37,7 @@ #include "cabac.h" #include "encoder.h" 
#include "encoderstate.h" +#include "encode_coding_tree.h" #include "image.h" #include "intra.h" #include "uvg266.h" @@ -97,13 +98,13 @@ static double get_cost(encoder_state_t * const state, // Add the offset bit costs of signaling 'luma and chroma use trskip', // versus signaling 'luma and chroma don't use trskip' to the SAD cost. - const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma; + const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma; double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); // ToDo: Check cost if (state->encoder_control->chroma_format != UVG_CSP_400) { - ctx = &state->cabac.ctx.transform_skip_model_chroma; + ctx = &state->search_cabac.ctx.transform_skip_model_chroma; trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); } @@ -253,13 +254,15 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, * \param cost_treshold RD cost at which search can be stopped. * \param mts_mode Selected MTS mode for current intra mode. 
*/ -static double search_intra_trdepth(encoder_state_t * const state, - int x_px, int y_px, int depth, int max_depth, - int intra_mode, int cost_treshold, - cu_info_t *const pred_cu, - lcu_t *const lcu, - cclm_parameters_t *cclm_params, - const int mts_mode) +static double search_intra_trdepth( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + int max_depth, + int cost_treshold, + intra_search_data_t *const search_data, + lcu_t *const lcu) { assert(depth >= 0 && depth <= MAX_PU_DEPTH); @@ -268,9 +271,10 @@ static double search_intra_trdepth(encoder_state_t * const state, const int offset = width / 2; const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400; + cu_info_t* pred_cu = &search_data->pred_cu; + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); struct { uvg_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; @@ -295,16 +299,16 @@ static double search_intra_trdepth(encoder_state_t * const state, cbf_clear(&pred_cu->cbf, depth, COLOR_V); } - const int8_t chroma_mode = reconstruct_chroma ? intra_mode : -1; + const int8_t chroma_mode = reconstruct_chroma ? 
pred_cu->intra.mode : -1; double best_rd_cost = MAX_INT; int best_tr_idx = 0; int trafo; int num_transforms = 1; - if (mts_mode != -1) + if (pred_cu->tr_idx != MTS_TR_NUM) { - trafo = mts_mode; - num_transforms = mts_mode + 1; + trafo = pred_cu->tr_idx; + num_transforms = pred_cu->tr_idx + 1; } else { @@ -315,6 +319,8 @@ static double search_intra_trdepth(encoder_state_t * const state, if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { num_transforms = MAX(num_transforms, 2); } + pred_cu->intra.mode_chroma = -1; + pred_cu->joint_cb_cr = 4; for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; if (mts_enabled) @@ -330,12 +336,10 @@ static double search_intra_trdepth(encoder_state_t * const state, } uvg_intra_recon_cu(state, - x_px, y_px, - depth, - intra_mode, -1, - pred_cu, cclm_params, pred_cu->intra.multi_ref_idx, - pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed, - lcu); + x_px, y_px, + depth, search_data, + pred_cu, + lcu); // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1 if (pred_cu->tr_idx > 1) @@ -343,7 +347,6 @@ static double search_intra_trdepth(encoder_state_t * const state, derive_mts_constraints(pred_cu, lcu, depth, lcu_px); if (pred_cu->violates_mts_coeff_constraint || !pred_cu->mts_last_scan_pos) { - assert(mts_mode == -1); //mts mode should not be decided and then not allowed to be used. 
(might be some exception here) continue; } } @@ -359,14 +362,17 @@ static double search_intra_trdepth(encoder_state_t * const state, } } if(reconstruct_chroma) { + int8_t luma_mode = pred_cu->intra.mode; + pred_cu->intra.mode = -1; + pred_cu->intra.mode_chroma = chroma_mode; + pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, - x_px, y_px, - depth, - -1, chroma_mode, - pred_cu, cclm_params, 0, - pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed, - lcu); + x_px, y_px, + depth, search_data, + pred_cu, + lcu); best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + pred_cu->intra.mode = luma_mode; } pred_cu->tr_skip = best_tr_idx == MTS_SKIP; pred_cu->tr_idx = best_tr_idx; @@ -394,17 +400,17 @@ static double search_intra_trdepth(encoder_state_t * const state, // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). if (depth < max_depth && depth < MAX_PU_DEPTH) { - split_cost = 3 * state->lambda; + split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu); if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu); } if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu); } if (split_cost < nosplit_cost) { - split_cost += 
search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1); + split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu); } double cbf_bits = 0.0; @@ -417,14 +423,15 @@ static double search_intra_trdepth(encoder_state_t * const state, // so this will code cbf as 0 and not code the cbf at all for descendants. if (state->encoder_control->chroma_format != UVG_CSP_400) { const uint8_t tr_depth = depth - pred_cu->depth; + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - const cabac_ctx_t* ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]); + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); + CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); } ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); + CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); } } @@ -452,29 +459,44 @@ static double search_intra_trdepth(encoder_state_t * const state, return nosplit_cost; } } +static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. 
+ for (uint8_t i = 1; i < length; ++i) { + const intra_search_data_t cur_cost = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost.cost < modes[j - 1].cost) { + modes[j] = modes[j - 1]; + --j; + } + modes[j] = cur_cost; + } +} - -static void search_intra_chroma_rough(encoder_state_t * const state, - int x_px, int y_px, int depth, - const uvg_pixel *orig_u, const uvg_pixel *orig_v, int16_t origstride, - uvg_intra_references *refs_u, uvg_intra_references *refs_v, - int8_t luma_mode, - int8_t modes[8], double costs[8], lcu_t* lcu) +static void search_intra_chroma_rough( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + const uvg_pixel *orig_u, + const uvg_pixel *orig_v, + int16_t origstride, + uvg_intra_references *refs_u, + uvg_intra_references *refs_v, + intra_search_data_t* chroma_data, + lcu_t* lcu) { assert(!(x_px & 4 || y_px & 4)); const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2); - - for (int i = 0; i < 8; ++i) { - costs[i] = 0; - } cost_pixel_nxn_func *const satd_func = uvg_pixels_get_satd_func(width); //cost_pixel_nxn_func *const sad_func = uvg_pixels_get_sad_func(width); - - cclm_parameters_t cclm_params; - + cu_loc_t loc = { x_px, y_px, width, width, width, width }; + uvg_pixel _pred[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); @@ -482,34 +504,27 @@ static void search_intra_chroma_rough(encoder_state_t * const state, uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); uvg_pixels_blit(orig_u, orig_block, width, width, origstride, width); - for (int i = 0; i < 5; ++i) { - if (modes[i] == -1) continue; - uvg_intra_predict(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false, 0); + int modes_count = (state->encoder_control->cfg.cclm ? 
8 : 5); + for (int i = 0; i < modes_count; ++i) { + if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue; + uvg_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); - costs[i] += satd_func(pred, orig_block); - } - for (int i = 5; i < 8; i++) { - assert(state->encoder_control->cfg.cclm); - uvg_predict_cclm( - state, - COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params); + chroma_data[i].cost += satd_func(pred, orig_block); } uvg_pixels_blit(orig_v, orig_block, width, width, origstride, width); - for (int i = 0; i < 5; ++i) { - if (modes[i] == -1) continue; - uvg_intra_predict(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false, 0); + for (int i = 0; i < modes_count; ++i) { + if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue; + uvg_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); - costs[i] += satd_func(pred, orig_block); - } - for (int i = 5; i < 8; i++) { - assert(state->encoder_control->cfg.cclm); - uvg_predict_cclm( - state, - COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params); + chroma_data[i].cost += satd_func(pred, orig_block); } - uvg_sort_modes(modes, costs, 5); + for (int i = 0; i < modes_count; ++i) { + const double bits = uvg_chroma_mode_bits(state, chroma_data[i].pred_cu.intra.mode_chroma, chroma_data[i].pred_cu.intra.mode); + chroma_data[i].bits = bits; + chroma_data[i].cost = bits * state->lambda_sqrt; + } } @@ -543,11 +558,16 @@ static void search_intra_chroma_rough(encoder_state_t * const state, * * \return Number of prediction modes in param modes. 
*/ -static int8_t search_intra_rough(encoder_state_t * const state, - uvg_pixel *orig, int32_t origstride, - uvg_intra_references *refs, - int log2_width, int8_t *intra_preds, - int8_t modes[67], double costs[67]) +static int16_t search_intra_rough( + encoder_state_t * const state, + uvg_pixel *orig, + int32_t origstride, + uvg_intra_references *refs, + int log2_width, + int8_t *intra_preds, + intra_search_data_t* modes_out, + cu_info_t* const pred_cu, + uint8_t mip_ctx) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? assert(log2_width >= 2 && log2_width <= 5); @@ -556,9 +576,11 @@ static int8_t search_intra_rough(encoder_state_t * const state, cost_pixel_nxn_func *sad_func = uvg_pixels_get_sad_func(width); cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width); cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width); + int8_t modes[UVG_NUM_INTRA_MODES]; + double costs[UVG_NUM_INTRA_MODES]; - const uvg_config *cfg = &state->encoder_control->cfg; - const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); + // const uvg_config *cfg = &state->encoder_control->cfg; + // const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); // Temporary block arrays uvg_pixel _preds[PARALLEL_BLKS * 32 * 32 + SIMD_ALIGNMENT]; @@ -587,12 +609,18 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. 
+ cu_loc_t loc = { 0, 0, width, width, width, width }; + intra_search_data_t search_proxy; + FILL(search_proxy, 0); + search_proxy.pred_cu = *pred_cu; + for (int mode = 2; mode <= 66; mode += PARALLEL_BLKS * offset) { double costs_out[PARALLEL_BLKS] = { 0 }; for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { - uvg_intra_predict(state, refs, log2_width, mode + i * offset, COLOR_Y, preds[i], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = mode + i*offset; + uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL); } } @@ -631,7 +659,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, if (mode_in_range) { for (int i = 0; i < PARALLEL_BLKS; ++i) { if (test_modes[i] >= 2 && test_modes[i] <= 66) { - uvg_intra_predict(state, refs, log2_width, test_modes[i], COLOR_Y, preds[i], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = test_modes[i]; + uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL); } } @@ -653,10 +682,10 @@ static int8_t search_intra_rough(encoder_state_t * const state, } } - int8_t add_modes[5] = {intra_preds[0], intra_preds[1], intra_preds[2], 0, 1}; + int8_t add_modes[INTRA_MPM_COUNT + 2] = {intra_preds[0], intra_preds[1], intra_preds[2], intra_preds[3], intra_preds[4], intra_preds[5], 0, 1}; // Add DC, planar and missing predicted modes. 
- for (int8_t pred_i = 0; pred_i < 5; ++pred_i) { + for (int8_t pred_i = 0; pred_i < (INTRA_MPM_COUNT + 2); ++pred_i) { bool has_mode = false; int8_t mode = add_modes[pred_i]; @@ -668,7 +697,8 @@ static int8_t search_intra_rough(encoder_state_t * const state, } if (!has_mode) { - uvg_intra_predict(state, refs, log2_width, mode, COLOR_Y, preds[0], filter_boundary, 0); + search_proxy.pred_cu.intra.mode = mode; + uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL); costs[modes_selected] = get_cost(state, preds[0], orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; @@ -677,16 +707,106 @@ static int8_t search_intra_rough(encoder_state_t * const state, // Add prediction mode coding cost as the last thing. We don't want this // affecting the halving search. - int lambda_cost = (int)(state->lambda_sqrt + 0.5); + const double not_mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0; + const double not_mip = state->encoder_control->cfg.mip ? 
CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; + const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); + const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); + const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); + const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += lambda_cost * uvg_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0); + int i = 0; + int smaller_than_pred = 0; + double bits; + for (; i < INTRA_MPM_COUNT; i++) { + if (intra_preds[i] == modes[mode_i]) { + break; + } + if (modes[mode_i] > intra_preds[i]) { + smaller_than_pred += 1; + } + } + if (i == 0) { + bits = planar_mode_flag + mpm_mode_bit; + } + else if (i < INTRA_MPM_COUNT) { + bits = not_planar_mode_flag + mpm_mode_bit + MAX(i, 4); + } + else { + bits = not_mpm_mode_bit + 5 + (modes[mode_i] - smaller_than_pred > 3); + } + bits += not_mrl + not_mip; + costs[mode_i] += state->lambda_sqrt * bits; + modes_out[mode_i].cost = costs[mode_i]; + modes_out[mode_i].pred_cu = *pred_cu; + modes_out[mode_i].pred_cu.intra.mode = modes[mode_i]; + modes_out[mode_i].pred_cu.intra.mode_chroma = modes[mode_i]; } #undef PARALLEL_BLKS - return modes_selected; } + +static void get_rough_cost_for_2n_modes( + encoder_state_t* const state, + uvg_intra_references* refs, + const cu_loc_t* const cu_loc, + uvg_pixel *orig, + int orig_stride, + intra_search_data_t *search_data, + int num_modes, + uint8_t mip_ctx) +{ +#define PARALLEL_BLKS 2 + assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes"); + const int width = cu_loc->width; + cost_pixel_nxn_multi_func* satd_dual_func = uvg_pixels_get_satd_dual_func(width); + cost_pixel_nxn_multi_func* sad_dual_func = 
uvg_pixels_get_sad_dual_func(width); + + uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; + pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); + + uvg_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT]; + uvg_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); + + uvg_pixels_blit(orig, orig_block, width, width, orig_stride, width); + + const double mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0; + const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; + const double mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 1) : 0; + double costs_out[PARALLEL_BLKS] = { 0 }; + double bits[PARALLEL_BLKS] = { 0 }; + for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { + for (int i = 0; i < PARALLEL_BLKS; ++i) { + uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); + } + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + + for(int i = 0; i < PARALLEL_BLKS; ++i) { + uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx; + if(multi_ref_idx) { + bits[i] = mrl + not_mip; + bits[i] += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[1]), multi_ref_idx != 1); + bits[i] += MIN((mode + i + 1) % 6, 4); + } + else if(search_data[mode + i].pred_cu.intra.mip_flag) { + bits[i] = mip + 1; + bits[i] += num_modes == 32 ? 4 : (num_modes == 16 ? 3 : (((mode + i) % 6) < 2 ? 
2 : 3)); + } + else { + assert(0 && "get_rough_cost_for_2n_modes supports only mrl and mip mode cost calculation"); + } + } + search_data[mode].cost = costs_out[0]; + search_data[mode + 1].cost = costs_out[1]; + + search_data[mode].cost += bits[0] * state->lambda_sqrt; + search_data[mode + 1].cost += bits[1] * state->lambda_sqrt; + } +#undef PARALLEL_BLKS +} + + /** * \brief Find best intra mode out of the ones listed in parameter modes. * @@ -713,224 +833,57 @@ static int8_t search_intra_rough(encoder_state_t * const state, * \param[out] lcu If transform split searching is used, the transform split * information for the best mode is saved in lcu.cu structure. */ -static int8_t search_intra_rdo(encoder_state_t * const state, - int x_px, int y_px, int depth, - uvg_pixel *orig, int32_t origstride, - int8_t *intra_preds, - int modes_to_check, - int8_t modes[67], int8_t trafo[67], double costs[67], - int num_mip_modes_full, - int8_t mip_modes[32], int8_t mip_trafo[32], double mip_costs[32], - lcu_t *lcu, - uint8_t multi_ref_idx) +static int8_t search_intra_rdo( + encoder_state_t * const state, + int x_px, + int y_px, + int depth, + int modes_to_check, + intra_search_data_t *search_data, + lcu_t *lcu) { const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: proper height for non-square blocks - - uvg_pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1]; - - uvg_pixels_blit(orig, orig_block, width, height, origstride, width); - - // Check that the predicted modes are in the RDO mode list - if (modes_to_check < 67) { - int pred_mode = 0; - // Skip planar if searching modes for MRL - if (multi_ref_idx != 0) { - pred_mode = 1; - } - for (; pred_mode < 6; pred_mode++) { - int mode_found = 0; - for (int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode++) { - if (intra_preds[pred_mode] == modes[rdo_mode]) { - mode_found = 1; - break; - } - } - // Add this prediction 
mode to RDO checking - if (!mode_found) { - modes[modes_to_check] = intra_preds[pred_mode]; - modes_to_check++; - } - } - } - - // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad. - // MIP_TODO: loop through normal intra modes first - for (int mip = 0; mip <= 1; mip++) { - const int transp_off = mip ? num_mip_modes_full >> 1 : 0; - uint8_t ctx_id = mip ? uvg_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL) : 0; - uint8_t multi_ref_index = mip ? 0 : multi_ref_idx; - int *num_modes = mip ? &num_mip_modes_full : &modes_to_check; + for (int mode = 0; mode < modes_to_check; mode++) { + double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); + search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; + search_data[mode].bits = rdo_bitcost; + search_data[mode].cost = rdo_bitcost * state->lambda; - for (uint8_t i = 0; i < *num_modes; i++) { - int8_t mode = mip ? mip_modes[i] : modes[i]; - double *mode_cost_p = mip ? &mip_costs[i] : &costs[i]; - int8_t *mode_trafo_p = mip ? &mip_trafo[i] : &trafo[i]; - int rdo_bitcost = uvg_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id); - - *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5); - - // Mip related stuff - // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream. - // Half of the modes [16, 31] are indicated with the separate transpose flag. - // Number of possible modes is less for larger blocks. - const bool is_transposed = mip ? (mode >= transp_off ? true : false) : 0; - int8_t pred_mode = (is_transposed ? mode - transp_off : mode); - - // Perform transform split search and save mode RD cost for the best one. - cu_info_t pred_cu; - pred_cu.depth = depth; - pred_cu.type = CU_INTRA; - pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? 
SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks - pred_cu.intra.mode = pred_mode; - pred_cu.intra.mode_chroma = pred_mode; - pred_cu.intra.multi_ref_idx = multi_ref_index; - pred_cu.intra.mip_is_transposed = is_transposed; - pred_cu.intra.mip_flag = mip ? true : false; - pred_cu.joint_cb_cr = 0; - FILL(pred_cu.cbf, 0); - - // Reset transform split data in lcu.cu for this area. - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); - - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1); - *mode_cost_p += mode_cost; - *mode_trafo_p = pred_cu.tr_idx; - - // Early termination if no coefficients has to be coded - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) { - *num_modes = i + 1; - break; - } + double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu); + search_data[mode].cost += mode_cost; + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { + modes_to_check = mode + 1; + break; } } // Update order according to new costs - uvg_sort_modes_intra_luma(modes, trafo, costs, modes_to_check); - bool use_mip = false; - if (num_mip_modes_full) { - uvg_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes_full); - if (costs[0] > mip_costs[0]) { - use_mip = true; + double best_cost = MAX_INT; + int best_mode = 0; + for (int mode = 0; mode < modes_to_check; mode++) { + if(search_data[mode].cost < best_cost) { + best_cost = search_data[mode].cost; + best_mode = mode; } } - + search_data[0] = search_data[best_mode]; - // The best transform split hierarchy is not saved anywhere, so to get the - // transform split hierarchy the search has to be performed again with the - // best mode. - if (tr_depth != depth) { - cu_info_t pred_cu; - pred_cu.depth = depth; - pred_cu.type = CU_INTRA; - pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? 
SIZE_NxN : SIZE_2Nx2N); - if (use_mip) { - int transp_off = num_mip_modes_full >> 1; - bool is_transposed = (mip_modes[0] >= transp_off ? true : false); - int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]); - pred_cu.intra.mode = pred_mode; - pred_cu.intra.mode_chroma = pred_mode; - pred_cu.intra.multi_ref_idx = 0; - pred_cu.intra.mip_flag = true; - pred_cu.intra.mip_is_transposed = is_transposed; - } - else { - pred_cu.intra.mode = modes[0]; - pred_cu.intra.mode_chroma = modes[0]; - pred_cu.intra.multi_ref_idx = multi_ref_idx; - pred_cu.intra.mip_flag = false; - pred_cu.intra.mip_is_transposed = false; - } - FILL(pred_cu.cbf, 0); - search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]); - } - - // TODO: modes to check does not consider mip modes. Maybe replace with array when mip search is optimized? return modes_to_check; } -double uvg_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id) +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) { - double mode_bits = 0.0; - - bool enable_mip = state->encoder_control->cfg.mip; - bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false; - - // Mip flag cost must be calculated even if mip is not used in this block - if (enable_mip) { - // Make a copy of state->cabac for bit cost estimation. 
- cabac_data_t state_cabac_copy; - cabac_data_t* cabac; - memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t)); - // Clear data and set mode to count only - state_cabac_copy.only_count = 1; - state_cabac_copy.num_buffered_bytes = 0; - state_cabac_copy.bits_left = 23; - - cabac = &state_cabac_copy; - - // Do cabac writes as normal - const int transp_off = num_mip_modes_half; - const bool is_transposed = luma_mode >= transp_off ? true : false; - int8_t mip_mode = is_transposed ? luma_mode - transp_off : luma_mode; - - // Write MIP flag - cabac->cur_ctx = &(cabac->ctx.mip_flag[mip_flag_ctx_id]); - CABAC_BIN(cabac, mip_flag, "mip_flag"); - - if (mip_flag) { - // Write MIP transpose flag & mode - CABAC_BIN_EP(cabac, is_transposed, "mip_transposed"); - uvg_cabac_encode_trunc_bin(cabac, mip_mode, transp_off); - } - - // Write is done. Get bit cost out of cabac - mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); - } - - if (!mip_flag) { - int8_t mode_in_preds = -1; - for (int i = 0; i < INTRA_MPM_COUNT; ++i) { - if (luma_mode == intra_preds[i]) { - mode_in_preds = i; - break; - } - } - - bool enable_mrl = state->encoder_control->cfg.mrl; - uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0; - - const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model); - - if (multi_ref_index == 0) { - mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1); - } - - // Add MRL bits. 
- if (enable_mrl && MAX_REF_LINE_IDX > 1) { - ctx = &(state->cabac.ctx.multi_ref_line[0]); - mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0); - - if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) { - ctx = &(state->cabac.ctx.multi_ref_line[1]); - mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1); - } - } - - if (mode_in_preds != -1 || multi_ref_index != 0) { - ctx = &(state->cabac.ctx.luma_planar_model[0]); - if (multi_ref_index == 0) { - mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0); - } - mode_bits += MIN(4.0, mode_in_preds); - } - else { - mode_bits += 6.0; - } - } + cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; + double mode_bits = 0; + cabac_data_t cabac_copy; + memcpy(&cabac_copy, cabac, sizeof cabac_copy); + uvg_encode_intra_luma_coding_unit( + state, + &cabac_copy, cur_cu, + x, y, depth, lcu, &mode_bits + ); return mode_bits; } @@ -938,7 +891,8 @@ double uvg_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode) { - const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model); + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model); double mode_bits; if (chroma_mode == luma_mode) { mode_bits = CTX_ENTROPY_FBITS(ctx, 0); @@ -958,15 +912,26 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67); } + if(cabac->update) { + if(chroma_mode != luma_mode) { + // Again it does not matter what we actually write here + CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode"); + } + } + return mode_bits; } -int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state, - int x_px, int y_px, int depth, - int8_t intra_mode, - int8_t modes[8], int8_t num_modes, - lcu_t *const lcu, cclm_parameters_t *best_cclm) +int8_t uvg_search_intra_chroma_rdo( + encoder_state_t * const state, 
+ int x_px, + int y_px, + int depth, + int8_t num_modes, + lcu_t *const lcu, + intra_search_data_t* chroma_data, + int8_t luma_mode) { const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); @@ -980,84 +945,32 @@ int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state, if (reconstruct_chroma) { - - int c_width = MAX(32 >> (depth), 4); - uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); - - cclm_parameters_t cclm_params[2] = { 0 }; - + const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - - struct { - double cost; - int8_t mode; - cclm_parameters_t cclm[2]; - } chroma, best_chroma; - - // chroma.cclm = cclm_params; - - best_chroma.mode = 0; - best_chroma.cost = MAX_INT; - - for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) { - chroma.mode = modes[chroma_mode_i]; - if (chroma.mode == -1) continue; - if(chroma.mode < 67 || depth == 0) { - uvg_intra_recon_cu(state, - x_px, y_px, - depth, - -1, chroma.mode, // skip luma - NULL, NULL, 0, false, false, lcu); + + for (int8_t i = 0; i < num_modes; ++i) { + const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma; + uvg_intra_recon_cu(state, + x_px, y_px, + depth, &chroma_data[i], + &chroma_data[i].pred_cu, + lcu); + + if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { + chroma_data[i].cost = uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu); + } else { + uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost); } - else { - uvg_predict_cclm( - state, COLOR_U, - c_width, c_width, - x_px & ~7, y_px & ~7, - state->tile->frame->source->stride, - 
chroma.mode, - lcu, - &refs[0], NULL, - &cclm_params[0]); - - chroma.cclm[0] = cclm_params[0]; - - uvg_predict_cclm( - state, COLOR_V, - c_width, c_width, - x_px & ~7, y_px & ~7, - state->tile->frame->source->stride, - chroma.mode, - lcu, - &refs[1], NULL, - &cclm_params[1]); - - chroma.cclm[1] = cclm_params[1]; - - uvg_intra_recon_cu( - state, - x_px, y_px, - depth, - -1, chroma.mode, // skip luma - NULL, cclm_params, 0, false, false, lcu); - } - chroma.cost = uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); - - double mode_bits = uvg_chroma_mode_bits(state, chroma.mode, intra_mode); - chroma.cost += mode_bits * state->lambda; - - if (chroma.cost < best_chroma.cost) { - best_chroma = chroma; - } + double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode); + chroma_data[i].cost += mode_bits * state->lambda; } - best_cclm[0] = best_chroma.cclm[0]; - best_cclm[1] = best_chroma.cclm[1]; + sort_modes(chroma_data, num_modes); - return best_chroma.mode; + return chroma_data[0].pred_cu.intra.mode_chroma; } return 100; @@ -1066,18 +979,25 @@ int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state, int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state, const int x_px, const int y_px, - const int depth, lcu_t *lcu, cclm_parameters_t *best_cclm) + const int depth, lcu_t *lcu, intra_search_data_t *search_data) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); int8_t intra_mode = cur_pu->intra.mode; - - double costs[8]; + int8_t modes[8] = { 0, 50, 18, 1, -1, 81, 82, 83 }; + uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) { modes[4] = intra_mode; } + else { + total_modes -= 1; + modes[4] = modes[5]; + modes[5] = modes[6]; + modes[6] = modes[7]; + } + // The number of modes to select for slower chroma search. 
Luma mode // is always one of the modes, so 2 means the final decision is made @@ -1087,13 +1007,21 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state, int num_modes = modes_in_depth[depth]; if (state->encoder_control->cfg.rdo >= 3) { - num_modes = state->encoder_control->cfg.cclm ? 8 : 5; + num_modes = total_modes; } + intra_search_data_t chroma_data[8]; + FILL(chroma_data, 0); + for (int i = 0; i < num_modes; i++) { + chroma_data[i].pred_cu = *cur_pu; + chroma_data[i].pred_cu.intra.mode_chroma = modes[i]; + chroma_data[i].pred_cu.intra.mode = -1; + } // Don't do rough mode search if all modes are selected. // FIXME: It might make more sense to only disable rough search if // num_modes is 0.is 0. - if (num_modes != 1 && num_modes != 5 && num_modes != 4 && num_modes != 8) { + + if (total_modes != num_modes) { const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; const vector2d_t luma_px = { x_px, y_px }; @@ -1109,16 +1037,18 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state, uvg_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; search_intra_chroma_rough(state, x_px, y_px, depth, - ref_u, ref_v, LCU_WIDTH_C, + ref_u, ref_v, + LCU_WIDTH_C, &refs_u, &refs_v, - intra_mode, modes, costs, lcu); + chroma_data, lcu); + sort_modes(chroma_data, total_modes); } int8_t intra_mode_chroma = intra_mode; if (num_modes > 1) { - intra_mode_chroma = uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu, best_cclm); + intra_mode_chroma = uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode); } - + *search_data = chroma_data[0]; return intra_mode_chroma; } @@ -1127,25 +1057,29 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state, * Update lcu to have best modes at this depth. * \return Cost of best mode. 
*/ -void uvg_search_cu_intra(encoder_state_t * const state, - const int x_px, const int y_px, - const int depth, lcu_t *lcu, - int8_t *mode_out, - int8_t *trafo_out, - double *cost_out, - uint8_t *multi_ref_idx_out, - bool *mip_flag_out, - bool * mip_transposed_out) +void uvg_search_cu_intra( + encoder_state_t * const state, + const int x_px, + const int y_px, + const int depth, + intra_search_data_t* mode_out, + lcu_t *lcu) { const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const int8_t cu_width = LCU_WIDTH >> depth; + const cu_loc_t cu_loc = { x_px, y_px, cu_width, cu_width, + MAX(cu_width >> 1, TR_MIN_WIDTH), MAX(cu_width >> 1, TR_MIN_WIDTH) }; const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth; + const vector2d_t luma_px = { x_px, y_px }; + const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - uvg_intra_references refs; + uvg_intra_references refs[MAX_REF_LINE_IDX]; int8_t candidate_modes[INTRA_MPM_COUNT]; + // Normal intra modes + mrl modes + mip modes + intra_search_data_t search_data[UVG_NUM_INTRA_MODES +(MAX_REF_LINE_IDX - 1) * (INTRA_MPM_COUNT - 1) + 32]; cu_info_t *left_cu = 0; cu_info_t *above_cu = 0; @@ -1161,75 +1095,115 @@ void uvg_search_cu_intra(encoder_state_t * const state, uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); if (depth > 0) { - const vector2d_t luma_px = { x_px, y_px }; - const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - - // These references will only be used with rough search. No need for MRL stuff here. 
- uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); } - int8_t modes[MAX_REF_LINE_IDX][67]; - int8_t trafo[MAX_REF_LINE_IDX][67] = { 0 }; - double costs[MAX_REF_LINE_IDX][67]; - - bool enable_mip = state->encoder_control->cfg.mip; - // The maximum number of mip modes is 32. Max modes can be less depending on block size. - // Half of the possible modes are transposed, which is indicated by a separate transpose flag - int8_t mip_modes[32]; - int8_t mip_trafo[32]; - double mip_costs[32]; - // The maximum number of possible MIP modes depend on block size & shape int width = LCU_WIDTH >> depth; int height = width; // TODO: proper height for non-square blocks. - int num_mip_modes = 0; - if (enable_mip) { - for (int i = 0; i < 32; ++i) { - mip_modes[i] = i; - mip_costs[i] = MAX_INT; - } - // MIP is not allowed for 64 x 4 or 4 x 64 blocks - if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { - num_mip_modes = NUM_MIP_MODES_FULL(width, height); - } - } + // This is needed for bit cost calculation and requires too many parameters to be + // calculated inside the rough search functions + uint8_t mip_ctx = uvg_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL); // Find best intra mode for 2Nx2N. 
uvg_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; - int8_t number_of_modes[MAX_REF_LINE_IDX] = { 0 }; + // Need to set some data for all cus + cu_info_t temp_pred_cu; + temp_pred_cu = *cur_cu; + temp_pred_cu.type = CU_INTRA; + FILL(temp_pred_cu.intra, 0); + + int16_t number_of_modes; bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { - number_of_modes[0] = search_intra_rough(state, - ref_pixels, LCU_WIDTH, - &refs, + number_of_modes = search_intra_rough(state, + ref_pixels, + LCU_WIDTH, + refs, log2_width, candidate_modes, - modes[0], costs[0]); - // Copy rough results for other reference lines - for (int line = 1; line < MAX_REF_LINE_IDX; ++line) { - number_of_modes[line] = number_of_modes[0]; - for (int i = 0; i < number_of_modes[line]; ++i) { - modes[line][i] = modes[0][i]; - costs[line][i] = costs[0][i]; - } - } + search_data, &temp_pred_cu, + mip_ctx); + } else { - for(int line = 0; line < MAX_REF_LINE_IDX; ++line) { - number_of_modes[line] = 67; - for (int i = 0; i < number_of_modes[line]; ++i) { - modes[line][i] = i; - costs[line][i] = MAX_INT; - } + for (int8_t i = 0; i < UVG_NUM_INTRA_MODES; i++) { + search_data[i].pred_cu = temp_pred_cu; + search_data[i].pred_cu.intra.mode = i; + search_data[i].pred_cu.intra.mode_chroma = i; + search_data[i].cost = MAX_INT; } + number_of_modes = UVG_NUM_INTRA_MODES; } - uint8_t lines = 1; - // Find modes with multiple reference lines if in use. Do not use if CU in first row. 
- if (state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0) { - lines = MAX_REF_LINE_IDX; + int num_mip_modes = 0; + if (state->encoder_control->cfg.mip) { + // MIP is not allowed for 64 x 4 or 4 x 64 blocks + if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { + num_mip_modes = NUM_MIP_MODES_FULL(width, height); + + for (int transpose = 0; transpose < 2; transpose++) { + const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + for (int i = 0; i < half_mip_modes; ++i) { + const int index = i + number_of_modes + transpose * half_mip_modes; + search_data[index].pred_cu = temp_pred_cu; + search_data[index].pred_cu.intra.mip_flag = 1; + search_data[index].pred_cu.intra.mode = i; + search_data[index].pred_cu.intra.mip_is_transposed = transpose; + search_data[index].pred_cu.intra.mode_chroma = i; + search_data[index].cost = MAX_INT; + } + } + if(!skip_rough_search) { + get_rough_cost_for_2n_modes(state, refs, &cu_loc, + ref_pixels, + LCU_WIDTH, search_data + number_of_modes, num_mip_modes, + mip_ctx); + } + } + number_of_modes += num_mip_modes; } + int num_mrl_modes = 0; + // Find modes with multiple reference lines if in use. Do not use if CU in first row. + uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; + + for(int line = 1; line < lines; ++line) { + uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; + + if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) { + videoframe_t* const frame = state->tile->frame; + + // Copy extra ref lines, including ref line 1 and top left corner. + for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { + int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; + height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. 
+ height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); + uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], + &extra_refs[i * 128], + 1, height, + frame->rec->stride, 1); + } + } + uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + for(int i = 1; i < INTRA_MPM_COUNT; i++) { + num_mrl_modes++; + const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; + search_data[index].pred_cu = temp_pred_cu; + search_data[index].pred_cu.intra.mode = candidate_modes[i]; + search_data[index].pred_cu.intra.multi_ref_idx = line; + search_data[index].pred_cu.intra.mode_chroma = candidate_modes[i]; + search_data[index].cost = MAX_INT; + } + } + if (!skip_rough_search && lines != 1) { + get_rough_cost_for_2n_modes(state, refs, &cu_loc, + ref_pixels, + LCU_WIDTH, search_data + number_of_modes, num_mrl_modes, + mip_ctx); + } + number_of_modes += num_mrl_modes; + // Set transform depth to current depth, meaning no transform splits. uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); // Refine results with slower search or get some results if rough search was skipped. @@ -1237,79 +1211,56 @@ void uvg_search_cu_intra(encoder_state_t * const state, if (rdo_level >= 2 || skip_rough_search) { int number_of_modes_to_search; if (rdo_level == 4) { - number_of_modes_to_search = 67; + number_of_modes_to_search = number_of_modes; } else if (rdo_level == 2 || rdo_level == 3) { number_of_modes_to_search = (cu_width == 4) ? 3 : 2; } else { // Check only the predicted modes. number_of_modes_to_search = 0; } - - for(int8_t line = 0; line < lines; ++line) { - // For extra reference lines, only check predicted modes & no MIP search. 
- if (line != 0) { - number_of_modes_to_search = 0; - num_mip_modes = 0; + if(!skip_rough_search) { + sort_modes(search_data, number_of_modes); + } + + for(int pred_mode = 0; pred_mode < INTRA_MPM_COUNT; ++pred_mode) { + bool mode_found = false; + for(int i = 0; i < number_of_modes_to_search; i++) { + if(search_data[i].pred_cu.intra.mode == candidate_modes[pred_mode]) { + mode_found = true; + break; + } + } + if(!mode_found) { + search_data[number_of_modes_to_search].pred_cu = temp_pred_cu; + search_data[number_of_modes_to_search].pred_cu.intra.mode = candidate_modes[pred_mode]; + search_data[number_of_modes_to_search].pred_cu.intra.mode_chroma = candidate_modes[pred_mode]; + number_of_modes_to_search++; } - int num_modes_to_check = MIN(number_of_modes[line], number_of_modes_to_search); - uvg_sort_modes(modes[line], costs[line], number_of_modes[line]); - // TODO: if rough search is implemented for MIP, sort mip_modes here. - number_of_modes[line] = search_intra_rdo(state, - x_px, y_px, depth, - ref_pixels, LCU_WIDTH, - candidate_modes, - num_modes_to_check, - modes[line], trafo[line], costs[line], - num_mip_modes, - mip_modes, mip_trafo, mip_costs, - lcu, line); } + + // TODO: if rough search is implemented for MIP, sort mip_modes here. 
+ search_intra_rdo( + state, + x_px, + y_px, + depth, + number_of_modes_to_search, + search_data, + lcu); + // Reset these + search_data[0].pred_cu.violates_mts_coeff_constraint = false; + search_data[0].pred_cu.mts_last_scan_pos = false; } - - uint8_t best_line = 0; - double best_line_mode_cost = costs[0][0]; - uint8_t best_mip_mode_idx = 0; - uint8_t best_mode_indices[MAX_REF_LINE_IDX]; - - int8_t tmp_best_mode; - int8_t tmp_best_trafo; - double tmp_best_cost; - bool tmp_mip_flag = false; - bool tmp_mip_transp = false; - - for (int line = 0; line < lines; ++line) { - best_mode_indices[line] = select_best_mode_index(modes[line], costs[line], number_of_modes[line]); - if (best_line_mode_cost > costs[line][best_mode_indices[line]]) { - best_line_mode_cost = costs[line][best_mode_indices[line]]; - best_line = line; + else { + double best_cost = MAX_INT; + int best_mode = 0; + for (int mode = 0; mode < number_of_modes; mode++) { + if (search_data[mode].cost < best_cost) { + best_cost = search_data[mode].cost; + best_mode = mode; + } } + search_data[0] = search_data[best_mode]; } - - tmp_best_mode = modes[best_line][best_mode_indices[best_line]]; - tmp_best_trafo = trafo[best_line][best_mode_indices[best_line]]; - tmp_best_cost = costs[best_line][best_mode_indices[best_line]]; - - if (num_mip_modes) { - best_mip_mode_idx = select_best_mode_index(mip_modes, mip_costs, num_mip_modes); - if (tmp_best_cost > mip_costs[best_mip_mode_idx]) { - tmp_best_mode = mip_modes[best_mip_mode_idx]; - tmp_best_trafo = mip_trafo[best_mip_mode_idx]; - tmp_best_cost = mip_costs[best_mip_mode_idx]; - tmp_mip_flag = true; - tmp_mip_transp = (tmp_best_mode >= (num_mip_modes >> 1)) ? 1 : 0; - } - } - - if (tmp_mip_flag) { - // Transform best mode index to proper form. - // Max mode index is half of max number of modes - 1 (i. e. for size id 2, max mode id is 5) - tmp_best_mode = (tmp_mip_transp ? 
tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode); - } - - *mode_out = tmp_best_mode; - *trafo_out = tmp_best_trafo; - *cost_out = tmp_best_cost; - *mip_flag_out = tmp_mip_flag; - *mip_transposed_out = tmp_mip_transp; - *multi_ref_idx_out = tmp_mip_flag ? 0 : best_line; + *mode_out = search_data[0]; } diff --git a/src/search_intra.h b/src/search_intra.h index ea73156b..7bcb6480 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -43,24 +43,21 @@ #include "global.h" // IWYU pragma: keep #include "intra.h" -double uvg_luma_mode_bits(const encoder_state_t *state, - int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id); +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu); double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode); int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state, const int x_px, const int y_px, - const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm); + const int depth, lcu_t *lcu, intra_search_data_t* best_cclm); -void uvg_search_cu_intra(encoder_state_t * const state, - const int x_px, const int y_px, - const int depth, lcu_t *lcu, - int8_t *mode_out, - int8_t *trafo_out, - double *cost_out, - uint8_t *multi_ref_idx_out, - bool *mip_flag, - bool *mip_transp); +void uvg_search_cu_intra( + encoder_state_t * const state, + const int x_px, + const int y_px, + const int depth, + intra_search_data_t* search_data, + lcu_t *lcu); #endif // SEARCH_INTRA_H_ diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 6ab6994d..a4ea1d58 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -225,39 +225,40 @@ int uvg_quant_cbcr_residual_generic( int64_t best_cost = INT64_MAX; // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM - for(int 
cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) { + for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) { int64_t d1 = 0; + const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1); for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { int cbx = u_residual[x + y * width], crx = v_residual[x + y * width]; - if (cbf_mask == 1) + if (cbf_mask == 2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1)); } - else if (cbf_mask == -1) + else if (cbf_mask == -2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1)); } else if (cbf_mask == 3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]); } else if (cbf_mask == -3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]); } - else if (cbf_mask == 2) + 
else if (cbf_mask == 1) { v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5); d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); } - else if (cbf_mask == -2) + else if (cbf_mask == -1) { v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5); d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); @@ -270,19 +271,19 @@ int uvg_quant_cbcr_residual_generic( } } if (d1 < best_cost) { - best_cbf_mask = cbf_mask; + best_cbf_mask = i; best_cost = d1; } } - uvg_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + uvg_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, tr_depth, cur_cu->cbf); } else if (state->encoder_control->cfg.rdoq_enable && false) { @@ -290,7 +291,7 @@ int uvg_quant_cbcr_residual_generic( scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); } @@ -309,10 +310,10 @@ int uvg_quant_cbcr_residual_generic( int y, x; // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? 
COLOR_V : COLOR_U, + uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - uvg_itransform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + uvg_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -333,32 +334,32 @@ int uvg_quant_cbcr_residual_generic( // } // } //} - + const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. (residual + pred_in -> rec_out) for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { - if (best_cbf_mask == 1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + if (temp == 2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == -1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + else if (temp == -2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == 3) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; + else if (temp == 3) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; } - 
else if (best_cbf_mask == -3) { + else if (temp == -3) { // non-normative clipping to prevent 16-bit overflow - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width]; + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width]; } - else if (best_cbf_mask == 2) { + else if (temp == 1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = v1_residual[x + y * width]; } - else if (best_cbf_mask == -2) { + else if (temp == -1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = -v1_residual[x + y * width]; } diff --git a/src/transform.c b/src/transform.c index 925964f2..4ca02c72 100644 --- a/src/transform.c +++ b/src/transform.c @@ -260,11 +260,9 @@ int uvg_quantize_residual_trskip( struct { uvg_pixel rec[LCU_WIDTH * LCU_WIDTH]; coeff_t coeff[LCU_WIDTH * LCU_WIDTH]; - uint32_t cost; + double cost; int has_coeffs; } skip, *best; - - const int bit_cost = (int)(state->lambda + 0.5); //noskip.has_coeffs = uvg_quantize_residual( // state, cur_cu, width, color, scan_order, @@ -278,7 +276,7 @@ int uvg_quantize_residual_trskip( 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj); skip.cost = uvg_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width); - skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost; + skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda; /* if (noskip.cost <= skip.cost) { *trskip_out = 0; @@ -481,15 +479,17 @@ static void quantize_tr_residual(encoder_state_t * const state, * - lcu->cbf coded block flags for the area * - 
lcu->cu.intra.tr_skip tr skip flags for the area (in case of luma) */ -void uvg_quantize_lcu_residual(encoder_state_t * const state, - const bool luma, - const bool chroma, - const int32_t x, - const int32_t y, - const uint8_t depth, - cu_info_t *cur_pu, - lcu_t* lcu, - bool early_skip) +void uvg_quantize_lcu_residual( + encoder_state_t * const state, + const bool luma, + const bool chroma, + const bool jccr, + const int32_t x, + const int32_t y, + const uint8_t depth, + cu_info_t *cur_pu, + lcu_t* lcu, + bool early_skip) { const int32_t width = LCU_WIDTH >> depth; const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; @@ -511,7 +511,7 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state, if (luma) { cbf_clear(&cur_pu->cbf, depth, COLOR_Y); } - if (chroma) { + if (chroma || jccr) { cbf_clear(&cur_pu->cbf, depth, COLOR_U); cbf_clear(&cur_pu->cbf, depth, COLOR_V); } @@ -523,10 +523,11 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state, const int32_t x2 = x + offset; const int32_t y2 = y + offset; - uvg_quantize_lcu_residual(state, luma, chroma, x, y, depth + 1, NULL, lcu, early_skip); - uvg_quantize_lcu_residual(state, luma, chroma, x2, y, depth + 1, NULL, lcu, early_skip); - uvg_quantize_lcu_residual(state, luma, chroma, x, y2, depth + 1, NULL, lcu, early_skip); - uvg_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip); + // jccr is currently not supported if transform is split + uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip); + uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip); + uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip); + uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip); // Propagate coded block flags from child CUs to parent CU. 
uint16_t child_cbfs[3] = { @@ -548,10 +549,10 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state, } if (chroma) { quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip); - if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){ - quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip); - } + quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip); + } + if (jccr && cur_pu->tr_depth == cur_pu->depth) { + quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip); } } } diff --git a/src/transform.h b/src/transform.h index a7fa232e..6a4f0bb9 100644 --- a/src/transform.h +++ b/src/transform.h @@ -67,14 +67,16 @@ void uvg_itransform2d(const encoder_control_t * const encoder, int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale); -void uvg_quantize_lcu_residual(encoder_state_t *state, - bool luma, - bool chroma, - int32_t x, - int32_t y, - uint8_t depth, - cu_info_t *cur_cu, - lcu_t* lcu, - bool early_skip); +void uvg_quantize_lcu_residual( + encoder_state_t *state, + bool luma, + bool chroma, + const bool jccr, + int32_t x, + int32_t y, + uint8_t depth, + cu_info_t *cur_cu, + lcu_t* lcu, + bool early_skip); #endif diff --git a/src/uvg266.h b/src/uvg266.h index 4ecc8d48..0593a605 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -267,6 +267,12 @@ enum uvg_amvr_resolution UVG_IMV_HPEL = 3 }; +enum uvg_roi_format +{ + UVG_ROI_TXT = 0, + UVG_ROI_BIN = 1 +}; + // Map from input format to chroma format. #define UVG_FORMAT2CSP(format) ((enum uvg_chroma_format)format) @@ -408,10 +414,9 @@ typedef struct uvg_config int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */ struct { - int32_t width; - int32_t height; - int8_t *dqps; - } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. 
*/ + char *file_path; + enum uvg_roi_format format; + } roi; /*!< \brief Specify delta QPs for region of interest coding. */ unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */ @@ -524,6 +529,12 @@ typedef struct uvg_config int8_t cclm; int8_t amvr; /* \brief Adaptive motion vector resolution parameter */ + + /** \brief whether to try combining intra cus at the lower depth when search + * is not performed at said depth*/ + uint8_t combine_intra_cus; + + uint8_t force_inter; } uvg_config; /** @@ -555,6 +566,14 @@ typedef struct uvg_picture { enum uvg_chroma_format chroma_format; int32_t ref_pocs[16]; + + struct + { + int width; + int height; + int8_t *roi_array; + } roi; + } uvg_picture; /** @@ -780,6 +799,9 @@ typedef struct uvg_api { * the bitstream, length of the bitstream, the reconstructed frame, the * original frame and frame info in data_out, len_out, pic_out, src_out and * info_out, respectively. Otherwise, set the output parameters to NULL. + * + * Region of interest (ROI) / delta QP map can be specified in the input + * picture's ROI field but only when a ROI file is not used. * * After passing all of the input frames, the caller should keep calling this * function with pic_in set to NULL, until no more data is returned in the diff --git a/tests/test_slices.sh b/tests/test_slices.sh index 512888b0..a4166036 100755 --- a/tests/test_slices.sh +++ b/tests/test_slices.sh @@ -3,6 +3,6 @@ set -eu . "${0%/*}/util.sh" -valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 +valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --gop 0 --tiles=2x2 #valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp #if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi