From 20d0a9b65e48b24b3adb43abffca4ff9f857f8a5 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 17 Jun 2022 09:15:01 +0300 Subject: [PATCH 01/36] [ibc] Add `--ibc` parameter and config values for Intra Block Copy --- src/cfg.c | 11 +++++++++++ src/cli.c | 1 + src/cu.h | 1 + src/encoder_state-bitstream.c | 6 +++++- src/global.h | 6 ++++++ src/uvg266.h | 3 +++ 6 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/cfg.c b/src/cfg.c index 6f3cbfef..c7c0ef9e 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -221,6 +221,9 @@ int uvg_config_init(uvg_config *cfg) cfg->cabac_debug_file_name = NULL; cfg->dual_tree = 0; + + cfg->ibc = 0; + return 1; } @@ -1475,6 +1478,14 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("dual-tree") { cfg->dual_tree = atobool(value); } + else if OPT ("ibc") { + int ibc_value = atoi(value); + if (ibc_value < 0 || ibc_value > 2) { + fprintf(stderr, "ibc supports only range from 0 to 2\n"); + return 0; + } + cfg->ibc = (uint8_t)ibc_value; + } else { return 0; } diff --git a/src/cli.c b/src/cli.c index 53f2df9b..3afaed62 100644 --- a/src/cli.c +++ b/src/cli.c @@ -191,6 +191,7 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, + { "ibc", required_argument, NULL, 0 }, {0, 0, 0, 0} }; diff --git a/src/cu.h b/src/cu.h index 74ff25a6..e3555d08 100644 --- a/src/cu.h +++ b/src/cu.h @@ -52,6 +52,7 @@ typedef enum { CU_INTRA = 1, CU_INTER = 2, CU_PCM = 3, + CU_IBC = 4, } cu_type_t; typedef enum { diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 3ef5c64e..b19ab758 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -694,7 +694,11 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_UE(stream, 0, "sps_internal_bit_depth_minus_input_bit_depth"); } - WRITE_U(stream, 0, 1, 
"sps_ibc_enabled_flag"); + WRITE_U(stream, encoder->cfg.ibc > 0 ? 1 : 0, 1, "sps_ibc_enabled_flag"); + + if (encoder->cfg.ibc) { + WRITE_UE(stream,6 - IBC_MRG_MAX_NUM_CANDS, "sps_six_minus_max_num_ibc_merge_cand"); + } #if LUMA_ADAPTIVE_DEBLOCKING_FILTER_QP_OFFSET // if(!no_ladf_constraint_flag) diff --git a/src/global.h b/src/global.h index 448ea1f1..1c2da76f 100644 --- a/src/global.h +++ b/src/global.h @@ -254,6 +254,12 @@ typedef int32_t mv_t; #define AMVP_MAX_NUM_CANDS 2 #define AMVP_MAX_NUM_CANDS_MEM 3 #define MRG_MAX_NUM_CANDS 6 +/** + * \brief Max number of merge candidates in Intra Block Copy + * + */ +#define IBC_MRG_MAX_NUM_CANDS 6 + #define MAX_NUM_HMVP_CANDS 5 diff --git a/src/uvg266.h b/src/uvg266.h index 1801c8ac..0c449913 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -541,6 +541,9 @@ typedef struct uvg_config char* cabac_debug_file_name; uint8_t dual_tree; + + uint8_t ibc; /* \brief Intra Block Copy parameter */ + } uvg_config; /** From 6ec4c37b472da5c6b090e4ddc3ede7e801fc43ca Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 27 Jun 2022 07:36:54 +0300 Subject: [PATCH 02/36] [ibc] Add IBC Flag context and code the bits, disable by default for now --- src/cabac.h | 1 + src/context.c | 8 ++++++++ src/encode_coding_tree.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/cabac.h b/src/cabac.h index 6f7aaa78..be249ba2 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -122,6 +122,7 @@ typedef struct cabac_ctx_t transform_skip_gt2[5]; cabac_ctx_t cclm_flag; cabac_ctx_t cclm_model; + cabac_ctx_t ibc_flag[3]; } ctx; } cabac_data_t; diff --git a/src/context.c b/src/context.c index 8e042cc2..83bd5502 100644 --- a/src/context.c +++ b/src/context.c @@ -423,6 +423,13 @@ static const uint8_t INIT_CCLM_MODEL[4] = { 9, }; +static const uint8_t INIT_IBC_FLAG[4][3] = { + { 0, 43, 45, }, + { 0, 57, 44, }, + { 17, 42, 36, }, + { 1, 5, 8, }, +}; + /* static const uint16_t g_inistateToCount[128] = { 614, 647, 681, 
718, 756, 797, 839, 884, 932, 982, 1034, 1089, 1148, 1209, 1274, 1342, @@ -514,6 +521,7 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice) uvg_ctx_init(&cabac->ctx.lfnst_idx_model[i], QP, INIT_LFNST_IDX[slice][i], INIT_LFNST_IDX[3][i]); uvg_ctx_init(&cabac->ctx.transform_skip_sig_coeff_group[i], QP, INIT_TRANSFORM_SKIP_SIG_COEFF_GROUP[slice][i], INIT_TRANSFORM_SKIP_SIG_COEFF_GROUP[3][i]); uvg_ctx_init(&cabac->ctx.transform_skip_sig[i], QP, INIT_TRANSFORM_SKIP_SIG[slice][i], INIT_TRANSFORM_SKIP_SIG[3][i]); + uvg_ctx_init(&cabac->ctx.ibc_flag[i], QP, INIT_IBC_FLAG[slice][i], INIT_IBC_FLAG[3][i]); } for (i = 0; i < 4; i++) { diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 0552e211..cb27099b 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1555,7 +1555,7 @@ void uvg_encode_coding_tree( } // Encode skip flag - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if ((state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) && cu_width != 4) { int8_t ctx_skip = 0; @@ -1570,6 +1570,15 @@ void uvg_encode_coding_tree( CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); if (cur_cu->skipped) { + + if (state->encoder_control->cfg.ibc) { // ToDo: Only for luma channel + // ToDo: Disable for blocks over 64x64 pixels + int8_t ctx_ibc = 0; + if (left_cu && left_cu->type == CU_IBC) ctx_ibc++; + if (above_cu && above_cu->type == CU_IBC) ctx_ibc++; + cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); + CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); + } DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); uvg_hmvp_add_mv(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); int16_t num_cand = state->encoder_control->cfg.max_merge; @@ -1597,6 +1606,15 @@ void uvg_encode_coding_tree( } // Prediction mode + if (state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc) { // ToDo: Only for luma channel + // ToDo: Disable for blocks over 64x64 
pixels + int8_t ctx_ibc = 0; + if (left_cu && left_cu->type == CU_IBC) ctx_ibc++; + if (above_cu && above_cu->type == CU_IBC) ctx_ibc++; + cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); + CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); + } + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { int8_t ctx_predmode = 0; @@ -1607,6 +1625,15 @@ void uvg_encode_coding_tree( cabac->cur_ctx = &(cabac->ctx.cu_pred_mode_model[ctx_predmode]); CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode"); + + // We need IBC flag if the mode is signalled as Inter + if (state->encoder_control->cfg.ibc && cur_cu->type != CU_INTRA) { + int8_t ctx_ibc = 0; + if (left_cu && left_cu->type == CU_IBC) ctx_ibc++; + if (above_cu && above_cu->type == CU_IBC) ctx_ibc++; + cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); + CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); + } } // part_mode From b49d32af214b6ea24b02eb22bdf14853132c70c7 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 29 Jun 2022 08:59:20 +0300 Subject: [PATCH 03/36] [ibc] Add IBC buffers --- src/cu.h | 2 +- src/encode_coding_tree.c | 95 +------------------------------- src/encoder_state-ctors_dtors.c | 21 +++++++ src/encoderstate.c | 52 +++++++++++++++++ src/encoderstate.h | 3 - src/global.h | 4 +- src/inter.c | 98 +++++++++++++++++++-------------- src/search.c | 8 +-- src/videoframe.h | 3 + 9 files changed, 143 insertions(+), 143 deletions(-) diff --git a/src/cu.h b/src/cu.h index e3555d08..ddddaf55 100644 --- a/src/cu.h +++ b/src/cu.h @@ -147,7 +147,7 @@ enum uvg_tree_type { */ typedef struct { - uint8_t type : 2; //!< \brief block type, one of cu_type_t values + uint8_t type : 3; //!< \brief block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values uint8_t tr_depth : 3; //!< \brief transform depth diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 
cb27099b..fa73e08e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1262,95 +1262,6 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, if (cabac->only_count && bits_out) *bits_out += bits; } -/** -static void encode_part_mode(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int depth) -{ - // Binarization from Table 9-34 of the HEVC spec: - // - // | log2CbSize > | log2CbSize == - // | MinCbLog2SizeY | MinCbLog2SizeY - // -------+-------+----------+---------+-----------+---------- - // pred | part | AMP | AMP | | - // mode | mode | disabled | enabled | size == 8 | size > 8 - // -------+-------+----------+---------+-----------+---------- - // intra | 2Nx2N | - - | 1 1 - // | NxN | - - | 0 0 - // -------+-------+--------------------+---------------------- - // inter | 2Nx2N | 1 1 | 1 1 - // | 2NxN | 01 011 | 01 01 - // | Nx2N | 00 001 | 00 001 - // | NxN | - - | - 000 - // | 2NxnU | - 0100 | - - - // | 2NxnD | - 0101 | - - - // | nLx2N | - 0000 | - - - // | nRx2N | - 0001 | - - - // -------+-------+--------------------+---------------------- - // - // - // Context indices from Table 9-37 of the HEVC spec: - // - // binIdx - // | 0 1 2 3 - // ------------------------------+------------------ - // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass - // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass - // ------------------------------+------------------ - double bits = 0; - if (cur_cu->type == CU_INTRA) { - if (depth == MAX_DEPTH) { - cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); - if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); - } else { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN"); - } - } - } else { - - cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); - if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_FBITS_UPDATE(cabac, 
&(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N"); - return bits; - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split"); - - cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); - if (cur_cu->part_size == SIZE_2NxN || - cur_cu->part_size == SIZE_2NxnU || - cur_cu->part_size == SIZE_2NxnD) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical"); - } else { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal"); - } - - if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) { - cabac->cur_ctx = &(cabac->ctx.part_size_model[3]); - - if (cur_cu->part_size == SIZE_2NxN || - cur_cu->part_size == SIZE_Nx2N) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP"); - return bits; - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP"); - - if (cur_cu->part_size == SIZE_2NxnU || - cur_cu->part_size == SIZE_nLx2N) { - CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); - if(cabac->only_count) bits += 1; - } else { - CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); - if(cabac->only_count) bits += 1; - } - } - } - return bits; -} -**/ - - bool uvg_write_split_flag( const encoder_state_t * const state, cabac_data_t* cabac, @@ -1684,7 +1595,7 @@ void uvg_encode_coding_tree( } else #endif - if (cur_cu->type == CU_INTER) { + if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { uint8_t imv_mode = UVG_IMV_OFF; const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size]; @@ -1706,10 +1617,10 @@ void uvg_encode_coding_tree( // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel if (ctrl->cfg.amvr && non_zero_mvd) { cabac->cur_ctx = &(cabac->ctx.imv_flag[0]); - CABAC_BIN(cabac, (imv_mode > UVG_IMV_OFF), "imv_flag"); + if(cur_cu->type != CU_IBC) CABAC_BIN(cabac, (imv_mode > UVG_IMV_OFF), "imv_flag"); if (imv_mode > UVG_IMV_OFF) { cabac->cur_ctx = &(cabac->ctx.imv_flag[4]); - CABAC_BIN(cabac, 
(imv_mode < UVG_IMV_HPEL), "imv_flag"); + if(cur_cu->type != CU_IBC) CABAC_BIN(cabac, (imv_mode < UVG_IMV_HPEL), "imv_flag"); if (imv_mode < UVG_IMV_HPEL) { cabac->cur_ctx = &(cabac->ctx.imv_flag[1]); CABAC_BIN(cabac, (imv_mode > UVG_IMV_FPEL), "imv_flag"); // 1 indicates 4PEL, 0 FPEL diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c index bb1300af..037f61d8 100644 --- a/src/encoder_state-ctors_dtors.c +++ b/src/encoder_state-ctors_dtors.c @@ -122,6 +122,18 @@ static int encoder_state_config_tile_init(encoder_state_t * const state, state->tile->frame->hmvp_lut = malloc(sizeof(cu_info_t) * height_in_lcu * MAX_NUM_HMVP_CANDS); state->tile->frame->hmvp_size = calloc(1, sizeof(uint8_t) * height_in_lcu); + if (state->encoder_control->cfg.ibc) { + // Allocate pixel buffer for each LCU row + state->tile->frame->ibc_buffer_y = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); + state->tile->frame->ibc_buffer_u = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); + state->tile->frame->ibc_buffer_v = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); + for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { + state->tile->frame->ibc_buffer_y[i] = (uvg_pixel*)malloc(IBC_BUFFER_SIZE * 3); // ToDo: we don't need this much, but it would also support 4:4:4 + state->tile->frame->ibc_buffer_u[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE]; + state->tile->frame->ibc_buffer_v[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE * 2]; + } + } + state->tile->frame->rec = NULL; state->tile->frame->source = NULL; @@ -197,6 +209,15 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) { FREE_POINTER(state->tile->frame->hmvp_lut); FREE_POINTER(state->tile->frame->hmvp_size); + if (state->encoder_control->cfg.ibc) { + for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { + FREE_POINTER(state->tile->frame->ibc_buffer_y[i]); + } + 
FREE_POINTER(state->tile->frame->ibc_buffer_y); + FREE_POINTER(state->tile->frame->ibc_buffer_u); + FREE_POINTER(state->tile->frame->ibc_buffer_v); + } + uvg_videoframe_free(state->tile->frame); state->tile->frame = NULL; FREE_POINTER(state->tile->wf_jobs); diff --git a/src/encoderstate.c b/src/encoderstate.c index 9bed1b86..bee55980 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -250,6 +250,58 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state, frame->rec->stride / 2, 1); } } + + // Fill IBC buffer + if (state->encoder_control->cfg.ibc) { + + uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x; + uint32_t ibc_buffer_pos_x_c = ibc_buffer_pos_x >> 1; + uint32_t ibc_buffer_row = lcu->position_px.y / LCU_WIDTH; + + // If the buffer is full shift all the lines LCU_WIDTH left + if (lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH) { + for (uint32_t i = 0; i < LCU_WIDTH; i++) { + memmove( + &frame->ibc_buffer_y[ibc_buffer_row][i * IBC_BUFFER_WIDTH], + &frame->ibc_buffer_y[ibc_buffer_row][i * IBC_BUFFER_WIDTH + LCU_WIDTH], + sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH - LCU_WIDTH)); + } + if (state->encoder_control->chroma_format != UVG_CSP_400) { + for (uint32_t i = 0; i < LCU_WIDTH_C; i++) { + memmove( + &frame->ibc_buffer_u[ibc_buffer_row][i * IBC_BUFFER_WIDTH_C], + &frame->ibc_buffer_u[ibc_buffer_row] + [i * IBC_BUFFER_WIDTH_C + LCU_WIDTH_C], + sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH_C - LCU_WIDTH_C)); + memmove( + &frame->ibc_buffer_v[ibc_buffer_row][i * IBC_BUFFER_WIDTH_C], + &frame->ibc_buffer_v[ibc_buffer_row] + [i * IBC_BUFFER_WIDTH_C + LCU_WIDTH_C], + sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH_C - LCU_WIDTH_C)); + } + } + } + + const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x)); + const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); + + 
uvg_pixels_blit(&frame->rec->y[lcu->position_px.y * frame->rec->stride + lcu->position_px.x], + &frame->ibc_buffer_y[ibc_buffer_row][ibc_buffer_pos_x], + ibc_block_width, ibc_block_height, + frame->rec->stride, IBC_BUFFER_WIDTH); + + if (state->encoder_control->chroma_format != UVG_CSP_400) { + uvg_pixels_blit(&frame->rec->u[(lcu->position_px.y >> 1) * (frame->rec->stride >> 1) + (lcu->position_px.x >> 1)], + &frame->ibc_buffer_u[ibc_buffer_row][ibc_buffer_pos_x_c], + ibc_block_width>>1, ibc_block_height>>1, + frame->rec->stride >> 1, IBC_BUFFER_WIDTH_C); + uvg_pixels_blit(&frame->rec->v[(lcu->position_px.y >> 1) * (frame->rec->stride >> 1) + (lcu->position_px.x >> 1)], + &frame->ibc_buffer_v[ibc_buffer_row][ibc_buffer_pos_x_c], + ibc_block_width>>1, ibc_block_height>>1, + frame->rec->stride >> 1, IBC_BUFFER_WIDTH_C); + + } + } } diff --git a/src/encoderstate.h b/src/encoderstate.h index 40e1dc24..55d265e3 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -192,9 +192,6 @@ typedef struct encoder_state_config_frame_t { double *c_para; double *k_para; - - cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row - uint8_t* hmvp_size; //!< \brief HMVP LUT size bool jccr_sign; } encoder_state_config_frame_t; diff --git a/src/global.h b/src/global.h index 1c2da76f..773f9c15 100644 --- a/src/global.h +++ b/src/global.h @@ -176,7 +176,6 @@ typedef int32_t mv_t; //! pow(2, MIN_SIZE) #define CU_MIN_SIZE_PIXELS (1 << MIN_SIZE) -//! Round frame size up to this interval (8 pixels) #define CONF_WINDOW_PAD_IN_PIXELS ((1 << MIN_SIZE)<<1) //! 
spec: CtbSizeY @@ -259,6 +258,9 @@ typedef int32_t mv_t; * */ #define IBC_MRG_MAX_NUM_CANDS 6 +#define IBC_BUFFER_SIZE (256*128) +#define IBC_BUFFER_WIDTH (IBC_BUFFER_SIZE / LCU_WIDTH) +#define IBC_BUFFER_WIDTH_C ((IBC_BUFFER_SIZE / LCU_WIDTH) >> 1) #define MAX_NUM_HMVP_CANDS 5 diff --git a/src/inter.c b/src/inter.c index f89ddf50..7f4c81bf 100644 --- a/src/inter.c +++ b/src/inter.c @@ -626,49 +626,63 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, const int pu_h = PU_GET_H(cu->part_size, width, i_pu); cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - if (pu->inter.mv_dir == 3) { - const uvg_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - pu->inter.mv_ref[1]]], - }; - uvg_inter_recon_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - pu->inter.mv, - lcu, - predict_luma, predict_chroma); + if (pu->type == CU_IBC) { + const int offset = x_scu + y_scu * LCU_WIDTH; + const int offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + uvg_pixels_blit(lcu->rec.y + offset, lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH); + uvg_pixels_blit(lcu->rec.u + offset_c, lcu->rec.joint_u + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset_c, lcu->rec.joint_v + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + } else { + + if (pu->inter.mv_dir == 3) { + const uvg_picture * const refs[2] = { + state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]], + state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]], + }; + uvg_inter_recon_bipred( + state, + refs[0], + refs[1], + pu_x, + pu_y, + pu_w, + pu_h, + pu->inter.mv, + lcu, + predict_luma, + predict_chroma); + } else { + const int mv_idx = pu->inter.mv_dir - 1; + const uvg_picture * const ref = + (cu->type == CU_IBC) ? 
+ state->tile->frame->rec : + (state->frame->ref + ->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]); + + const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); + const unsigned offset_chroma = + SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; + yuv_t lcu_adapter; + lcu_adapter.size = pu_w * pu_h; + lcu_adapter.y = lcu->rec.y + offset_luma, + lcu_adapter.u = lcu->rec.u + offset_chroma, + lcu_adapter.v = lcu->rec.v + offset_chroma, + + inter_recon_unipred( + state, + ref, + pu_x, + pu_y, + pu_w, + pu_h, + LCU_WIDTH, + pu->inter.mv[mv_idx], + &lcu_adapter, + NULL, + predict_luma, + predict_chroma); + } } - else { - const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - pu->inter.mv_ref[mv_idx]]]; - - const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); - const unsigned offset_chroma = SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; - yuv_t lcu_adapter; - lcu_adapter.size = pu_w * pu_h; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, - - inter_recon_unipred(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - LCU_WIDTH, - pu->inter.mv[mv_idx], - &lcu_adapter, - NULL, - predict_luma, predict_chroma); - } - if (predict_chroma && state->encoder_control->cfg.jccr) { const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); diff --git a/src/search.c b/src/search.c index 2e594126..59c99473 100644 --- a/src/search.c +++ b/src/search.c @@ -179,7 +179,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } -static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) +static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type) { const part_mode_t part_mode = 
LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; const int num_pu = uvg_part_mode_num_parts[part_mode]; @@ -191,7 +191,7 @@ static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) const int height_pu = PU_GET_H(part_mode, cu_width, i); cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - pu->type = CU_INTER; + pu->type = type; lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu); } } @@ -1034,7 +1034,7 @@ static double search_cu( lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - } else if (cur_cu->type == CU_INTER) { + } else if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { if (!cur_cu->skipped) { @@ -1080,7 +1080,7 @@ static double search_cu( inter_bitcost += cur_cu->merge_idx; } } - lcu_fill_inter(lcu, x_local, y_local, cu_width); + lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type); lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } } diff --git a/src/videoframe.h b/src/videoframe.h index e1a82181..54f17689 100644 --- a/src/videoframe.h +++ b/src/videoframe.h @@ -78,6 +78,9 @@ typedef struct videoframe int32_t poc; //!< \brief Picture order count cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row + uvg_pixel **ibc_buffer_y; //!< \brief Intra Block Copy buffer for each LCU row + uvg_pixel **ibc_buffer_u; //!< \brief Intra Block Copy buffer for each LCU row + uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer for each LCU row uint8_t* hmvp_size; //!< \brief HMVP LUT size bool source_lmcs_mapped; //!< \brief Indicate if source_lmcs is available and mapped to LMCS From 6f19f9798721ab5a145c6d91e2d8c5c5137b7a7e Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 29 Jun 2022 09:00:41 +0300 Subject: [PATCH 04/36] [CI] Add IBC test to CI --- tests/test_tools.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tools.sh b/tests/test_tools.sh index 398ef2e2..5f1f3aeb 100755 --- a/tests/test_tools.sh +++ b/tests/test_tools.sh 
@@ -14,4 +14,5 @@ valgrind_test $common_args --gop=8 --subme=4 --bipred --tmvp valgrind_test $common_args --transform-skip --tr-skip-max-size=5 valgrind_test $common_args --vaq=8 valgrind_test $common_args --vaq=8 --bitrate 350000 -valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 350000 \ No newline at end of file +valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 350000 +valgrind_test $common_args --ibc=1 \ No newline at end of file From dbc2006ba9aef71b8c040559cefca47cfe7279b0 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 29 Jun 2022 17:05:01 +0300 Subject: [PATCH 05/36] [ibc] Implement IBC reconstruction function when blocks are completely in the ibc buffer --- src/inter.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/src/inter.c b/src/inter.c index 7f4c81bf..4bbef292 100644 --- a/src/inter.c +++ b/src/inter.c @@ -593,6 +593,39 @@ void uvg_inter_recon_cu(const encoder_state_t * const state, } } +static void ibc_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width, + bool predict_luma, + bool predict_chroma, + int i_pu) +{ + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + const int offset = x_scu + y_scu * LCU_WIDTH; + const int offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); + + int32_t mv_x = cu->inter.mv[0][0] >> UVG_IMV_4PEL; + int32_t mv_y = cu->inter.mv[0][1] >> UVG_IMV_4PEL; + uint32_t ibc_row = y / LCU_WIDTH; + + int32_t buffer_x = ((x - x_scu) + LCU_WIDTH < IBC_BUFFER_WIDTH ? 
+ x : + x - (((x - x_scu) + LCU_WIDTH) - IBC_BUFFER_WIDTH)) + mv_x; + int32_t buffer_y = y_scu + mv_y; + + // Predicted block completely outside of this LCU + if (mv_x + x_scu + width < 0) { + uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + } else if (mv_x + x_scu + width >= width) { // Completely in current LCU + } +} + /** * Predict a single PU. * @@ -627,11 +660,7 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); if (pu->type == CU_IBC) { - const int offset = x_scu + y_scu * LCU_WIDTH; - const int offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - uvg_pixels_blit(lcu->rec.y + offset, lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH); - uvg_pixels_blit(lcu->rec.u + offset_c, lcu->rec.joint_u + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(lcu->rec.v + offset_c, lcu->rec.joint_v + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); } else { if (pu->inter.mv_dir == 3) { From a46a4531a3cb8f237a9c2adbef4d49ef241273c7 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 30 Jun 2022 14:26:28 +0300 Subject: [PATCH 06/36] [ibc] Add HMVP for IBC and correct AMVP selection --- src/encoder_state-ctors_dtors.c | 7 + src/encoderstate.c | 7 + src/global.h | 6 +- src/inter.c | 278 +++++++++++++++++++++++++++++--- src/search.c | 21 ++- src/videoframe.h | 5 +- 6 files 
changed, 294 insertions(+), 30 deletions(-) diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c index 037f61d8..e2b55ada 100644 --- a/src/encoder_state-ctors_dtors.c +++ b/src/encoder_state-ctors_dtors.c @@ -122,6 +122,10 @@ static int encoder_state_config_tile_init(encoder_state_t * const state, state->tile->frame->hmvp_lut = malloc(sizeof(cu_info_t) * height_in_lcu * MAX_NUM_HMVP_CANDS); state->tile->frame->hmvp_size = calloc(1, sizeof(uint8_t) * height_in_lcu); + // Allocate the HMVP for IBC in any case + state->tile->frame->hmvp_lut_ibc = malloc(sizeof(cu_info_t) * height_in_lcu * MAX_NUM_HMVP_CANDS); + state->tile->frame->hmvp_size_ibc = calloc(1, sizeof(uint8_t) * height_in_lcu); + if (state->encoder_control->cfg.ibc) { // Allocate pixel buffer for each LCU row state->tile->frame->ibc_buffer_y = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); @@ -209,6 +213,9 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) { FREE_POINTER(state->tile->frame->hmvp_lut); FREE_POINTER(state->tile->frame->hmvp_size); + FREE_POINTER(state->tile->frame->hmvp_lut_ibc); + FREE_POINTER(state->tile->frame->hmvp_size_ibc); + if (state->encoder_control->cfg.ibc) { for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { FREE_POINTER(state->tile->frame->ibc_buffer_y[i]); diff --git a/src/encoderstate.c b/src/encoderstate.c index bee55980..7bb12de8 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -744,9 +744,12 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) cu_info_t original_lut[MAX_NUM_HMVP_CANDS]; uint8_t original_lut_size = state->tile->frame->hmvp_size[ctu_row]; + cu_info_t original_lut_ibc[MAX_NUM_HMVP_CANDS]; + uint8_t original_lut_size_ibc = state->tile->frame->hmvp_size_ibc[ctu_row]; // Store original HMVP lut before search and restore after, since it's modified if(state->frame->slicetype != UVG_SLICE_I) memcpy(original_lut, 
&state->tile->frame->hmvp_lut[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + if(state->encoder_control->cfg.ibc) memcpy(original_lut_ibc, &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); //This part doesn't write to bitstream, it's only search, deblock and sao uvg_search_lcu(state, lcu->position_px.x, lcu->position_px.y, state->tile->hor_buf_search, state->tile->ver_buf_search, lcu->coeff); @@ -755,6 +758,10 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) memcpy(&state->tile->frame->hmvp_lut[ctu_row_mul_five], original_lut, sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); state->tile->frame->hmvp_size[ctu_row] = original_lut_size; } + if (state->encoder_control->cfg.ibc) { + memcpy(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], original_lut_ibc, sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + state->tile->frame->hmvp_size_ibc[ctu_row] = original_lut_size_ibc; + } encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); diff --git a/src/global.h b/src/global.h index 773f9c15..4dab59a0 100644 --- a/src/global.h +++ b/src/global.h @@ -128,9 +128,9 @@ typedef int16_t coeff_t; typedef int32_t mv_t; -//#define VERBOSE 1 -//#define UVG_DEBUG_PRINT_CABAC 1 -//#define UVG_DEBUG 1 +#define VERBOSE 1 +#define UVG_DEBUG_PRINT_CABAC 1 +#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/inter.c b/src/inter.c index 4bbef292..be4a7923 100644 --- a/src/inter.c +++ b/src/inter.c @@ -617,12 +617,28 @@ static void ibc_recon_cu(const encoder_state_t * const state, x - (((x - x_scu) + LCU_WIDTH) - IBC_BUFFER_WIDTH)) + mv_x; int32_t buffer_y = y_scu + mv_y; + // The whole block must fit to the left of the current position + assert(-mv_x >= width); + // Predicted block completely outside of this LCU - if (mv_x + x_scu + width < 0) { - uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * 
IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); - uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + if (mv_x + x_scu + width <= 0) { + if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); + if (predict_chroma) { + uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + } } else if (mv_x + x_scu + width >= width) { // Completely in current LCU + if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x], lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH); + if (predict_chroma) { + uvg_pixels_blit(&lcu->rec.u[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x) / 2], lcu->rec.u + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&lcu->rec.v[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x) / 2], lcu->rec.v + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + } + } else { // Partly on the buffer and partly on the current LCU rec + if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); + if (predict_chroma) { + 
uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + } } } @@ -960,6 +976,74 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } + +/** + * \brief Get merge candidates for current block. + * + * The output parameters b0, b1, b2, a0, a1 are pointed to the + * corresponding cu_info_t struct in lcu->cu, or set to NULL, if the + * candidate is not available. + * + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param picture_width tile width in pixels + * \param picture_height tile height in pixels + * \param lcu current LCU + * \param cand_out will be filled with A and B candidates + */ +static void get_ibc_merge_candidates(int32_t x, + int32_t y, + int32_t width, + int32_t height, + int32_t picture_width, + int32_t picture_height, + lcu_t *lcu, + merge_candidates_t *cand_out, + uint8_t parallel_merge_level, + bool wpp + ) +{ + /* + Predictor block locations + ____ _______ + |B2|______|B1|B0| + | | + | Cur CU | + __| | + |A1|_________| + |A0| + */ + int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + int32_t y_local = SUB_SCU(y); + // A0 and A1 availability testing + if (x != 0) { + cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); + // Do not check a1->coded because the block above is always coded before + // the current one and the flag is not set when searching an SMP block. 
+ if (a1->type == CU_IBC) { + inter_clear_cu_unused(a1); + cand_out->a[1] = a1; + cand_out->mer_a1[0] = parallel_merge_level; + } + } + + // B0, B1 and B2 availability testing + if (y != 0) { + + cu_info_t *b1 = LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1); + // Do not check b1->coded because the block to the left is always coded + // before the current one and the flag is not set when searching an SMP + // block. + if (b1->type == CU_IBC) { + inter_clear_cu_unused(b1); + cand_out->b[1] = b1; + } + } +} + + /** * \brief Get merge candidates for current block. * @@ -1056,6 +1140,65 @@ static void get_spatial_merge_candidates(int32_t x, } } + +/** + * \brief Get merge candidates for current block. + * + * The output parameters b0, b1, b2, a0, a1 are pointed to the + * corresponding cu_info_t struct in lcu->cu, or set to NULL, if the + * candidate is not available. + * + * \param cua cu information + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param picture_width tile width in pixels + * \param picture_height tile height in pixels + * \param cand_out will be filled with A and B candidates + */ +static void get_ibc_merge_candidates_cua(const cu_array_t *cua, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + int32_t picture_width, + int32_t picture_height, + merge_candidates_t *cand_out, + bool wpp) +{ + /* + Predictor block locations + ____ _______ + |B2|______|B1|B0| + | | + | Cur CU | + __| | + |A1|_________| + |A0| + */ + int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + int32_t y_local = SUB_SCU(y); + // A0 and A1 availability testing + if (x != 0) { + const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); + // The block above is always coded before the current one. 
+ if (a1->type == CU_IBC) { + cand_out->a[1] = a1; + } + } + + // B1 availability testing + if (y != 0) { + + const cu_info_t* b1 = uvg_cu_array_at_const(cua, x + width - 1, y - 1); + // The block to the left is always coded before the current one. + if (b1->type == CU_IBC) { + cand_out->b[1] = b1; + } + } +} + /** * \brief Get merge candidates for current block. * @@ -1425,6 +1568,71 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, } } + +/** + * \brief Pick two mv candidates from the spatial and temporal candidates. + */ +static void get_ibc_mv_cand_from_candidates(const encoder_state_t * const state, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + const merge_candidates_t *merge_cand, + const cu_info_t * const cur_cu, + int8_t reflist, + mv_t mv_cand[2][2]) +{ + const cu_info_t *const *a = merge_cand->a; + const cu_info_t *const *b = merge_cand->b; + + uint8_t candidates = 0; + uint8_t b_candidates = 0; + + // Left predictors without scaling + if (add_mvp_candidate(state, cur_cu, a[1], reflist, false, mv_cand[candidates])) { + candidates++; + } + + + // Top predictors without scaling + if (add_mvp_candidate(state, cur_cu, b[1], reflist, false, mv_cand[candidates])) { + b_candidates++; + } + + candidates += b_candidates; + + if (candidates > 0) + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); + if (candidates > 1) + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); + + // Remove identical candidate + if (candidates == 2 && mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]) { + candidates = 1; + } + + if (candidates < AMVP_MAX_NUM_CANDS) + { + const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; + int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; + for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { + cu_info_t* cand = 
&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + num_cand - 1 - i]; + mv_cand[candidates][0] = cand->inter.mv[0][0]; + mv_cand[candidates][1] = cand->inter.mv[0][1]; + candidates++; + if (candidates == AMVP_MAX_NUM_CANDS) return; + } + } + + // Fill with (0,0) + while (candidates < AMVP_MAX_NUM_CANDS) { + mv_cand[candidates][0] = 0; + mv_cand[candidates][1] = 0; + candidates++; + } +} + /** * \brief Get MV prediction for current block. * @@ -1450,14 +1658,22 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, { merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); - + if (cur_cu->type == CU_IBC) { + get_ibc_merge_candidates(x, y, width, height, + state->tile->frame->width, + state->tile->frame->height, + lcu, + &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); + get_ibc_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates(x, y, width, height, + state->tile->frame->width, + state->tile->frame->height, + lcu, + &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1486,13 +1702,20 @@ void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, merge_candidates_t 
merge_cand = { 0 }; const cu_array_t *cua = state->tile->frame->cu_array; - get_spatial_merge_candidates_cua(cua, - x, y, width, height, - state->tile->frame->width, state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); - + if (cur_cu->type == CU_IBC) { + get_ibc_merge_candidates_cua(cua,x, y, width, height, + state->tile->frame->width, + state->tile->frame->height, + &merge_cand, state->encoder_control->cfg.wpp); + get_ibc_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates_cua(cua, + x, y, width, height, + state->tile->frame->width, state->tile->frame->height, + &merge_cand, state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1577,21 +1800,28 @@ static bool hmvp_push_lut_item(cu_info_t* lut, int32_t size, const cu_info_t* cu void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu) { //if (!cu.geoFlag && !cu.affine) - if(cu->type == CU_INTER) + if(cu->type != CU_INTRA) { const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; const uint32_t xBr = block_width + pic_x; const uint32_t yBr = block_height + pic_y; bool hmvp_possible = ((xBr >> parallel_merge_level) > (pic_x >> parallel_merge_level)) && ((yBr >> parallel_merge_level) > (pic_y >> parallel_merge_level)); - if (hmvp_possible) { // ToDo: check for IBC + if (hmvp_possible || cu->type == 
CU_IBC) { const uint32_t ctu_row = (pic_y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; - bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut[ctu_row_mul_five], state->tile->frame->hmvp_size[ctu_row], cu); - if(add_row && state->tile->frame->hmvp_size[ctu_row] < MAX_NUM_HMVP_CANDS) { - state->tile->frame->hmvp_size[ctu_row]++; + if (cu->type == CU_IBC) { + bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], state->tile->frame->hmvp_size_ibc[ctu_row], cu); + if(add_row && state->tile->frame->hmvp_size_ibc[ctu_row] < MAX_NUM_HMVP_CANDS) { + state->tile->frame->hmvp_size_ibc[ctu_row]++; + } + } else { + bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut[ctu_row_mul_five], state->tile->frame->hmvp_size[ctu_row], cu); + if(add_row && state->tile->frame->hmvp_size[ctu_row] < MAX_NUM_HMVP_CANDS) { + state->tile->frame->hmvp_size[ctu_row]++; + } } } } diff --git a/src/search.c b/src/search.c index 59c99473..3fefd1c2 100644 --- a/src/search.c +++ b/src/search.c @@ -803,9 +803,12 @@ static double search_cu( cu_info_t hmvp_lut[MAX_NUM_HMVP_CANDS]; uint8_t hmvp_lut_size = state->tile->frame->hmvp_size[ctu_row]; + cu_info_t hmvp_lut_ibc[MAX_NUM_HMVP_CANDS]; + uint8_t hmvp_lut_size_ibc = state->tile->frame->hmvp_size_ibc[ctu_row]; // Store original HMVP lut before search and restore after, since it's modified if (state->frame->slicetype != UVG_SLICE_I) memcpy(hmvp_lut, &state->tile->frame->hmvp_lut[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + if(state->encoder_control->cfg.ibc) memcpy(hmvp_lut_ibc, &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); struct { int32_t min; @@ -1288,7 +1291,14 @@ static double search_cu( if (state->frame->slicetype != UVG_SLICE_I) { // Reset HMVP to the beginning of this CU level search and add this CU as the mvp memcpy(&state->tile->frame->hmvp_lut[ctu_row_mul_five], hmvp_lut, 
sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); - state->tile->frame->hmvp_size[ctu_row] = hmvp_lut_size; + state->tile->frame->hmvp_size[ctu_row] = hmvp_lut_size; + } + if (state->encoder_control->cfg.ibc) { + memcpy(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], hmvp_lut_ibc, sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + state->tile->frame->hmvp_size_ibc[ctu_row] = hmvp_lut_size_ibc; + } + // Add candidate when in inter slice or ibc is enabled + if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); } } @@ -1310,7 +1320,14 @@ static double search_cu( if (state->frame->slicetype != UVG_SLICE_I) { // Reset HMVP to the beginning of this CU level search and add this CU as the mvp memcpy(&state->tile->frame->hmvp_lut[ctu_row_mul_five], hmvp_lut, sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); - state->tile->frame->hmvp_size[ctu_row] = hmvp_lut_size; + state->tile->frame->hmvp_size[ctu_row] = hmvp_lut_size; + } + if (state->encoder_control->cfg.ibc) { + memcpy(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], hmvp_lut_ibc, sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + state->tile->frame->hmvp_size_ibc[ctu_row] = hmvp_lut_size_ibc; + } + // Add candidate when in inter slice or ibc is enabled + if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); } } diff --git a/src/videoframe.h b/src/videoframe.h index 54f17689..2e6bb8fb 100644 --- a/src/videoframe.h +++ b/src/videoframe.h @@ -77,11 +77,14 @@ typedef struct videoframe struct param_set_map* alf_param_set_map; int32_t poc; //!< \brief Picture order count - cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row + uvg_pixel **ibc_buffer_y; //!< \brief Intra Block Copy buffer for each LCU row uvg_pixel **ibc_buffer_u; //!< \brief Intra Block Copy buffer for each LCU row uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer for each LCU row 
+ cu_info_t* hmvp_lut_ibc; //!< \brief Look-up table for HMVP in IBC, one for each LCU row + uint8_t* hmvp_size_ibc; //!< \brief HMVP IBC LUT size + cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row uint8_t* hmvp_size; //!< \brief HMVP LUT size bool source_lmcs_mapped; //!< \brief Indicate if source_lmcs is available and mapped to LMCS bool lmcs_top_level; //!< \brief Indicate that in this level the LMCS images are allocated From d9164f3cfe0af4e018f36b9efb7c6f86c5edc309 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 1 Jul 2022 06:37:19 +0300 Subject: [PATCH 07/36] [ibc] Simplify the IBC merge candidate and mv cand selection --- src/inter.c | 478 ++++++++++++++++++++++------------------------------ 1 file changed, 206 insertions(+), 272 deletions(-) diff --git a/src/inter.c b/src/inter.c index be4a7923..5fd8c21c 100644 --- a/src/inter.c +++ b/src/inter.c @@ -604,8 +604,8 @@ static void ibc_recon_cu(const encoder_state_t * const state, { const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); - const int offset = x_scu + y_scu * LCU_WIDTH; - const int offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + uint32_t offset = x_scu + y_scu * LCU_WIDTH; + uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); int32_t mv_x = cu->inter.mv[0][0] >> UVG_IMV_4PEL; @@ -623,7 +623,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, // Predicted block completely outside of this LCU if (mv_x + x_scu + width <= 0) { if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); - if (predict_chroma) { + if (predict_chroma) { uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); 
uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); } @@ -634,10 +634,22 @@ static void ibc_recon_cu(const encoder_state_t * const state, uvg_pixels_blit(&lcu->rec.v[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x) / 2], lcu->rec.v + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); } } else { // Partly on the buffer and party on the current LCU rec - if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); + + uint32_t width_buffer = -(mv_x + x_scu); + uint32_t width_lcu = width - width_buffer; + if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width_buffer, width, IBC_BUFFER_WIDTH, LCU_WIDTH); if (predict_chroma) { - uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + } + + offset += width_buffer; + offset_c += width_buffer/2; + + if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x + width_buffer], lcu->rec.y + offset, width_lcu, 
width, LCU_WIDTH, LCU_WIDTH); + if (predict_chroma) { + uvg_pixels_blit(&lcu->rec.u[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.u + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&lcu->rec.v[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.v + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); } } } @@ -977,6 +989,128 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } +static INLINE int16_t get_scaled_mv(int16_t mv, int scale) +{ + int32_t scaled = scale * mv; + return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8); +} + +#define MV_EXPONENT_BITCOUNT 4 +#define MV_MANTISSA_BITCOUNT 6 +#define MV_MANTISSA_UPPER_LIMIT ((1 << (MV_MANTISSA_BITCOUNT - 1)) - 1) +#define MV_MANTISSA_LIMIT (1 << (MV_MANTISSA_BITCOUNT - 1)) +#define MV_EXPONENT_MASK ((1 << MV_EXPONENT_BITCOUNT) - 1) + +static int convert_mv_fixed_to_float(int32_t val) +{ + uint32_t sign = val >> 31; + int scale = uvg_math_floor_log2((val ^ sign) | MV_MANTISSA_UPPER_LIMIT) - (MV_MANTISSA_BITCOUNT - 1); + + int exponent; + uint32_t mantissa; + if (scale >= 0) + { + int round = (1 << scale) >> 1; + int n = (val + round) >> scale; + exponent = scale + ((n ^ sign) >> (MV_MANTISSA_BITCOUNT - 1)); + mantissa = (n & MV_MANTISSA_UPPER_LIMIT) | (sign << (MV_MANTISSA_BITCOUNT - 1)); + } + else + { + exponent = 0; + mantissa = val; + } + + return exponent | (mantissa << MV_EXPONENT_BITCOUNT); +} + +static int convert_mv_float_to_fixed(int val) +{ + int exponent = val & MV_EXPONENT_MASK; + uint32_t mantissa = val >> MV_EXPONENT_BITCOUNT; + return exponent == 0 ? 
mantissa : (mantissa ^ MV_MANTISSA_LIMIT) << (exponent - 1); +} + +static int round_mv_comp(int x) +{ + return convert_mv_float_to_fixed(convert_mv_fixed_to_float(x)); +} + +static void apply_mv_scaling_pocs(int32_t current_poc, + int32_t current_ref_poc, + int32_t neighbor_poc, + int32_t neighbor_ref_poc, + mv_t mv_cand[2]) +{ + int32_t diff_current = current_poc - current_ref_poc; + int32_t diff_neighbor = neighbor_poc - neighbor_ref_poc; + + if (diff_current == diff_neighbor) return; + + diff_current = CLIP(-128, 127, diff_current); + diff_neighbor = CLIP(-128, 127, diff_neighbor); + + int scale = CLIP(-4096, 4095, + (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6); + + mv_cand[0] = get_scaled_mv(mv_cand[0], scale); + mv_cand[1] = get_scaled_mv(mv_cand[1], scale); +} + +static INLINE void apply_mv_scaling(const encoder_state_t *state, + const cu_info_t *current_cu, + const cu_info_t *neighbor_cu, + int8_t current_reflist, + int8_t neighbor_reflist, + mv_t mv_cand[2]) +{ + apply_mv_scaling_pocs(state->frame->poc, + state->frame->ref->pocs[ + state->frame->ref_LX[current_reflist][ + current_cu->inter.mv_ref[current_reflist]]], + state->frame->poc, + state->frame->ref->pocs[ + state->frame->ref_LX[neighbor_reflist][ + neighbor_cu->inter.mv_ref[neighbor_reflist]]], + mv_cand); +} + +static INLINE bool add_mvp_candidate(const encoder_state_t *state, + const cu_info_t *cur_cu, + const cu_info_t *cand, + int8_t reflist, + bool scaling, + mv_t mv_cand_out[2]) +{ + if (!cand) return false; + + assert(cand->inter.mv_dir != 0); + + for (int i = 0; i < 2; i++) { + const int cand_list = i == 0 ? 
reflist : !reflist; + + if ((cand->inter.mv_dir & (1 << cand_list)) == 0) continue; + + if (scaling) { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); + return true; + } + + if (state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == + state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) + { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + return true; + } + } + + return false; +} + + /** * \brief Get merge candidates for current block. * @@ -993,16 +1127,15 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param cand_out will be filled with A and B candidates */ -static void get_ibc_merge_candidates(int32_t x, +static void get_ibc_merge_candidates(const encoder_state_t * const state, + const cu_info_t * const cur_cu, + lcu_t *lcu, + const cu_array_t *cua, + int32_t x, int32_t y, int32_t width, int32_t height, - int32_t picture_width, - int32_t picture_height, - lcu_t *lcu, - merge_candidates_t *cand_out, - uint8_t parallel_merge_level, - bool wpp + mv_t mv_cand[IBC_MRG_MAX_NUM_CANDS][2] ) { /* @@ -1017,29 +1150,72 @@ static void get_ibc_merge_candidates(int32_t x, */ int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU int32_t y_local = SUB_SCU(y); - // A0 and A1 availability testing + + cu_info_t *a1 = NULL; + cu_info_t *b1 = NULL; + + // A1 availability testing if (x != 0) { - cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); + a1 = lcu != NULL?LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1): uvg_cu_array_at_const(cua, x - 1, y + height - 1); // Do not check a1->coded because the block above is always coded before // the current one and the flag is not set when searching an SMP block. 
if (a1->type == CU_IBC) { inter_clear_cu_unused(a1); - cand_out->a[1] = a1; - cand_out->mer_a1[0] = parallel_merge_level; + } else { + a1 = NULL; } } - // B0, B1 and B2 availability testing + // B1 availability testing if (y != 0) { - - cu_info_t *b1 = LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1); + b1 = lcu != NULL?LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1): uvg_cu_array_at_const(cua, x + width - 1, y - 1); // Do not check b1->coded because the block to the left is always coded // before the current one and the flag is not set when searching an SMP // block. if (b1->type == CU_IBC) { - inter_clear_cu_unused(b1); - cand_out->b[1] = b1; + inter_clear_cu_unused(b1); + } else { + b1 = NULL; } + } + + uint8_t candidates = 0; + + // Left predictors without scaling + if (add_mvp_candidate(state, cur_cu, a1, 0, false, mv_cand[candidates])) { + candidates++; + } + + // Top predictors without scaling + if (add_mvp_candidate(state, cur_cu, b1, 0, false, mv_cand[candidates])) { + candidates++; + } + + + if (candidates > 0) + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); + if (candidates > 1) + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); + + if (candidates < IBC_MRG_MAX_NUM_CANDS) + { + const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; + int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; + for (int i = 0; i < MIN(MAX_NUM_HMVP_CANDS,num_cand); i++) { + cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + num_cand - 1 - i]; + mv_cand[candidates][0] = cand->inter.mv[0][0]; + mv_cand[candidates][1] = cand->inter.mv[0][1]; + candidates++; + if (candidates == IBC_MRG_MAX_NUM_CANDS) return; + } + } + + // Fill with (0,0) + while (candidates < IBC_MRG_MAX_NUM_CANDS) { + mv_cand[candidates][0] = 0; + mv_cand[candidates][1] = 0; + candidates++; } } @@ -1140,65 +1316,6 @@ static void 
get_spatial_merge_candidates(int32_t x, } } - -/** - * \brief Get merge candidates for current block. - * - * The output parameters b0, b1, b2, a0, a1 are pointed to the - * corresponding cu_info_t struct in lcu->cu, or set to NULL, if the - * candidate is not available. - * - * \param cua cu information - * \param x block x position in pixels - * \param y block y position in pixels - * \param width block width in pixels - * \param height block height in pixels - * \param picture_width tile width in pixels - * \param picture_height tile height in pixels - * \param cand_out will be filled with A and B candidates - */ -static void get_ibc_merge_candidates_cua(const cu_array_t *cua, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int32_t picture_width, - int32_t picture_height, - merge_candidates_t *cand_out, - bool wpp) -{ - /* - Predictor block locations - ____ _______ - |B2|______|B1|B0| - | | - | Cur CU | - __| | - |A1|_________| - |A0| - */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); - // A0 and A1 availability testing - if (x != 0) { - const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); - // The block above is always coded before the current one. - if (a1->type == CU_IBC) { - cand_out->a[1] = a1; - } - } - - // B1 availability testing - if (y != 0) { - - const cu_info_t* b1 = uvg_cu_array_at_const(cua, x + width - 1, y - 1); - // The block to the left is always coded before the current one. - if (b1->type == CU_IBC) { - cand_out->b[1] = b1; - } - } -} - /** * \brief Get merge candidates for current block. 
* @@ -1285,86 +1402,6 @@ static INLINE mv_t get_scaled_mv(mv_t mv, int scale) return CLIP(-131072, 131071, (scaled + 127 + (scaled < 0)) >> 8); } -#define MV_EXPONENT_BITCOUNT 4 -#define MV_MANTISSA_BITCOUNT 6 -#define MV_MANTISSA_UPPER_LIMIT ((1 << (MV_MANTISSA_BITCOUNT - 1)) - 1) -#define MV_MANTISSA_LIMIT (1 << (MV_MANTISSA_BITCOUNT - 1)) -#define MV_EXPONENT_MASK ((1 << MV_EXPONENT_BITCOUNT) - 1) - -static int convert_mv_fixed_to_float(int32_t val) -{ - uint32_t sign = val >> 31; - int scale = uvg_math_floor_log2((val ^ sign) | MV_MANTISSA_UPPER_LIMIT) - (MV_MANTISSA_BITCOUNT - 1); - - int exponent; - uint32_t mantissa; - if (scale >= 0) - { - int round = (1 << scale) >> 1; - int n = (val + round) >> scale; - exponent = scale + ((n ^ sign) >> (MV_MANTISSA_BITCOUNT - 1)); - mantissa = (n & MV_MANTISSA_UPPER_LIMIT) | (sign << (MV_MANTISSA_BITCOUNT - 1)); - } - else - { - exponent = 0; - mantissa = val; - } - - return exponent | (mantissa << MV_EXPONENT_BITCOUNT); -} - -static int convert_mv_float_to_fixed(int val) -{ - int exponent = val & MV_EXPONENT_MASK; - uint32_t mantissa = val >> MV_EXPONENT_BITCOUNT; - return exponent == 0 ? 
mantissa : (mantissa ^ MV_MANTISSA_LIMIT) << (exponent - 1); -} - -static int round_mv_comp(int x) -{ - return convert_mv_float_to_fixed(convert_mv_fixed_to_float(x)); -} - -static void apply_mv_scaling_pocs(int32_t current_poc, - int32_t current_ref_poc, - int32_t neighbor_poc, - int32_t neighbor_ref_poc, - mv_t mv_cand[2]) -{ - int32_t diff_current = current_poc - current_ref_poc; - int32_t diff_neighbor = neighbor_poc - neighbor_ref_poc; - - if (diff_current == diff_neighbor) return; - - diff_current = CLIP(-128, 127, diff_current); - diff_neighbor = CLIP(-128, 127, diff_neighbor); - - int scale = CLIP(-4096, 4095, - (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6); - - mv_cand[0] = get_scaled_mv(mv_cand[0], scale); - mv_cand[1] = get_scaled_mv(mv_cand[1], scale); -} - -static INLINE void apply_mv_scaling(const encoder_state_t *state, - const cu_info_t *current_cu, - const cu_info_t *neighbor_cu, - int8_t current_reflist, - int8_t neighbor_reflist, - mv_t mv_cand[2]) -{ - apply_mv_scaling_pocs(state->frame->poc, - state->frame->ref->pocs[ - state->frame->ref_LX[current_reflist][ - current_cu->inter.mv_ref[current_reflist]]], - state->frame->poc, - state->frame->ref->pocs[ - state->frame->ref_LX[neighbor_reflist][ - neighbor_cu->inter.mv_ref[neighbor_reflist]]], - mv_cand); -} - /** * \brief Try to add a temporal MVP or merge candidate. * @@ -1432,41 +1469,6 @@ static bool add_temporal_candidate(const encoder_state_t *state, return true; } -static INLINE bool add_mvp_candidate(const encoder_state_t *state, - const cu_info_t *cur_cu, - const cu_info_t *cand, - int8_t reflist, - bool scaling, - mv_t mv_cand_out[2]) -{ - if (!cand) return false; - - assert(cand->inter.mv_dir != 0); - - for (int i = 0; i < 2; i++) { - const int cand_list = i == 0 ? 
reflist : !reflist; - - if ((cand->inter.mv_dir & (1 << cand_list)) == 0) continue; - - if (scaling) { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); - return true; - } - - if (state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == - state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) - { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - return true; - } - } - - return false; -} - /** * \brief Pick two mv candidates from the spatial and temporal candidates. */ @@ -1568,71 +1570,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, } } - -/** - * \brief Pick two mv candidates from the spatial and temporal candidates. - */ -static void get_ibc_mv_cand_from_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - const merge_candidates_t *merge_cand, - const cu_info_t * const cur_cu, - int8_t reflist, - mv_t mv_cand[2][2]) -{ - const cu_info_t *const *a = merge_cand->a; - const cu_info_t *const *b = merge_cand->b; - - uint8_t candidates = 0; - uint8_t b_candidates = 0; - - // Left predictors without scaling - if (add_mvp_candidate(state, cur_cu, a[1], reflist, false, mv_cand[candidates])) { - candidates++; - } - - - // Top predictors without scaling - if (add_mvp_candidate(state, cur_cu, b[1], reflist, false, mv_cand[candidates])) { - b_candidates++; - } - - candidates += b_candidates; - - if (candidates > 0) - uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); - if (candidates > 1) - uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); - - // Remove identical candidate - if (candidates == 2 && mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]) { - candidates = 1; - } - - if (candidates < AMVP_MAX_NUM_CANDS) - { - const uint32_t ctu_row 
= (y >> LOG2_LCU_WIDTH); - const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; - int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; - for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { - cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + num_cand - 1 - i]; - mv_cand[candidates][0] = cand->inter.mv[0][0]; - mv_cand[candidates][1] = cand->inter.mv[0][1]; - candidates++; - if (candidates == AMVP_MAX_NUM_CANDS) return; - } - } - - // Fill with (0,0) - while (candidates < AMVP_MAX_NUM_CANDS) { - mv_cand[candidates][0] = 0; - mv_cand[candidates][1] = 0; - candidates++; - } -} - /** * \brief Get MV prediction for current block. * @@ -1659,12 +1596,10 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; if (cur_cu->type == CU_IBC) { - get_ibc_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_ibc_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); + memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { get_spatial_merge_candidates(x, y, width, height, state->tile->frame->width, @@ -1703,11 +1638,10 @@ void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, const cu_array_t *cua = state->tile->frame->cu_array; if (cur_cu->type == CU_IBC) { - get_ibc_merge_candidates_cua(cua,x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_ibc_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); 
+ mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; + get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand); + memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); + memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { get_spatial_merge_candidates_cua(cua, x, y, width, height, From cc4c7576950117e6675df8ecf83b2b3db0857b70 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Sat, 2 Jul 2022 18:18:42 +0300 Subject: [PATCH 08/36] [ibc] Fix bugs on IBC reconstruction and add a simple search for I-frames --- src/debug.c | 2 +- src/encode_coding_tree.c | 2 +- src/encoderstate.c | 2 +- src/inter.c | 22 +++++------ src/search.c | 85 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 99 insertions(+), 14 deletions(-) diff --git a/src/debug.c b/src/debug.c index eed773ee..1a2f00a0 100644 --- a/src/debug.c +++ b/src/debug.c @@ -131,7 +131,7 @@ void uvg_dbg_yuview_init(const encoder_control_t* const encoder, char* filename, fprintf(yuview_output, "%%;scaleFactor;16\r\n"); fprintf(yuview_output, "%%;type;13;MVInterL0;vector\r\n"); fprintf(yuview_output, "%%;vectorColor;0;0;0;255\r\n"); - fprintf(yuview_output, "%%;scaleFactor;16\r\n"); + fprintf(yuview_output, "%%;scaleFactor;4\r\n"); fprintf(yuview_output, "%%;type;14;MVInterL1;vector\r\n"); fprintf(yuview_output, "%%;vectorColor;255;255;255;255\r\n"); fprintf(yuview_output, "%%;scaleFactor;16\r\n"); diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index fa73e08e..88aec44e 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1458,7 +1458,7 @@ void uvg_encode_coding_tree( // CABAC_BIN(cabac, 0, "split_transform_flag"); } - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, (cur_cu->type == CU_INTRA)?0:1); + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; diff --git 
a/src/encoderstate.c b/src/encoderstate.c index 7bb12de8..e6f8546e 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -254,7 +254,7 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state, // Fill IBC buffer if (state->encoder_control->cfg.ibc) { - uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x; + uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH >= IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x; uint32_t ibc_buffer_pos_x_c = ibc_buffer_pos_x >> 1; uint32_t ibc_buffer_row = lcu->position_px.y / LCU_WIDTH; diff --git a/src/inter.c b/src/inter.c index 5fd8c21c..944f9c47 100644 --- a/src/inter.c +++ b/src/inter.c @@ -612,20 +612,20 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t mv_y = cu->inter.mv[0][1] >> UVG_IMV_4PEL; uint32_t ibc_row = y / LCU_WIDTH; - int32_t buffer_x = ((x - x_scu) + LCU_WIDTH < IBC_BUFFER_WIDTH ? + int32_t buffer_x = ((x - x_scu) + LCU_WIDTH <= IBC_BUFFER_WIDTH ? 
x : - x - (((x - x_scu) + LCU_WIDTH) - IBC_BUFFER_WIDTH)) + mv_x; + x - (((x - x_scu)) - IBC_BUFFER_WIDTH)) + mv_x; int32_t buffer_y = y_scu + mv_y; - // The whole block must fir to the left of the current position + // The whole block must be to the left of the current position assert(-mv_x >= width); // Predicted block completely outside of this LCU if (mv_x + x_scu + width <= 0) { if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH); if (predict_chroma) { - uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); } } else if (mv_x + x_scu + width >= width) { // Completely in current LCU if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x], lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH); @@ -639,15 +639,15 @@ static void ibc_recon_cu(const encoder_state_t * const state, uint32_t width_lcu = width - width_buffer; if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width_buffer, width, IBC_BUFFER_WIDTH, LCU_WIDTH); if (predict_chroma) { - 
uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.u + offset_c, width_buffer / 2 + (width_buffer&1), width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.v + offset_c, width_buffer / 2 + (width_buffer&1), width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C); } offset += width_buffer; - offset_c += width_buffer/2; + offset_c += width_buffer/2 + (width_buffer&1); if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x + width_buffer], lcu->rec.y + offset, width_lcu, width, LCU_WIDTH, LCU_WIDTH); - if (predict_chroma) { + if (predict_chroma && (width_lcu / 2)) { uvg_pixels_blit(&lcu->rec.u[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.u + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&lcu->rec.v[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.v + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); } @@ -1202,8 +1202,8 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; - for (int i = 0; i < MIN(MAX_NUM_HMVP_CANDS,num_cand); i++) { - cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + num_cand - 1 - i]; + for (int i = 0; i < 
MIN(4,num_cand); i++) { + cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + i]; mv_cand[candidates][0] = cand->inter.mv[0][0]; mv_cand[candidates][1] = cand->inter.mv[0][1]; candidates++; diff --git a/src/search.c b/src/search.c index 3fefd1c2..9743905e 100644 --- a/src/search.c +++ b/src/search.c @@ -1008,6 +1008,91 @@ static double search_cu( } } + // Simple IBC search + if (can_use_intra && state->frame->slicetype == UVG_SLICE_I + && state->encoder_control->cfg.ibc) { + cu_info_t cu_backup = *cur_cu; + + uint32_t ibc_cost = MAX_INT; + uint32_t ibc_cost_y = MAX_INT; + uint32_t base_cost = MAX_INT; + uint32_t base_cost_y = MAX_INT; + + + if(cur_cu->type == CU_INTRA) { + uvg_intra_recon_cu(state,x, y,depth, &intra_search,NULL,lcu); + } else { + uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); + } + + bool ibc_better = false; + cur_cu->type = CU_IBC; + cur_cu->inter.mv_dir = 1; + cur_cu->skipped = false; + cur_cu->merged = false; + cur_cu->inter.mv_cand0 = 0; + optimized_sad_func_ptr_t optimized_sad = uvg_get_optimized_sad(cu_width); + uint32_t source_stride = state->tile->frame->width; + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + const uint32_t offset = x_scu + y_scu * LCU_WIDTH; + const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + + mv_t best_vector[2] = {0, 0}; + + + if (optimized_sad != NULL) { + base_cost_y = base_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + base_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); + base_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 
2); + } + } else { + base_cost_y = base_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + base_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); + base_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); + } + } + + for(int i = 0; i < 8; i++) { + cur_cu->inter.mv[0][0] = (-cu_width - i) << UVG_IMV_4PEL; + cur_cu->inter.mv[0][1] = 0; + + if (x -cu_width - i < 0) break; + + uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); + + if (optimized_sad != NULL) { + ibc_cost_y = ibc_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + ibc_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); + ibc_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); + } + } else { + ibc_cost_y = ibc_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + ibc_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); + ibc_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride 
/ 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); + } + } + if (ibc_cost_y < base_cost_y) { + ibc_better = true; + base_cost_y = ibc_cost_y; + best_vector[0] = cur_cu->inter.mv[0][0]; + best_vector[1] = cur_cu->inter.mv[0][1]; + //break; + } + } + + if (!ibc_better) *cur_cu = cu_backup; + else { + cur_cu->inter.mv[0][0] = best_vector[0]; + cur_cu->inter.mv[0][1] = best_vector[1]; + //fprintf(stderr, "Coding IBC: %d, %d: %d, %d size: %d\r\n", x,y,cur_cu->inter.mv[0][0] / 4, cur_cu->inter.mv[0][1] / 4, cu_width); + } + } + // Reconstruct best mode because we need the reconstructed pixels for // mode search of adjacent CUs. if (cur_cu->type == CU_INTRA) { From 0fdf96fab29b21cc04e50b57c453462cc6fab3f9 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Sun, 3 Jul 2022 09:41:05 +0300 Subject: [PATCH 09/36] [ibc] Change internal MV storage to INTERNAL_MV_PREC and code it as full-pel --- src/encode_coding_tree.c | 2 +- src/global.h | 6 +++--- src/inter.c | 6 +++--- src/search.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 88aec44e..c81ee323 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -906,7 +906,7 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - uvg_change_precision(INTERNAL_MV_PREC, uvg_g_imv_to_prec[UVG_IMV_OFF], &mvd_hor, &mvd_ver); + uvg_change_precision(INTERNAL_MV_PREC, uvg_g_imv_to_prec[(cur_cu->type == CU_IBC)?UVG_IMV_FPEL:UVG_IMV_OFF], &mvd_hor, &mvd_ver); uvg_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out); non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0); diff --git a/src/global.h b/src/global.h index 4dab59a0..773f9c15 100644 --- a/src/global.h +++ b/src/global.h @@ -128,9 +128,9 @@ typedef int16_t coeff_t; typedef int32_t mv_t; -#define VERBOSE 1 
-#define UVG_DEBUG_PRINT_CABAC 1 -#define UVG_DEBUG 1 +//#define VERBOSE 1 +//#define UVG_DEBUG_PRINT_CABAC 1 +//#define UVG_DEBUG 1 //#define UVG_DEBUG_PRINT_YUVIEW_CSV 1 //#define UVG_DEBUG_PRINT_MV_INFO 1 diff --git a/src/inter.c b/src/inter.c index 944f9c47..c0fc0207 100644 --- a/src/inter.c +++ b/src/inter.c @@ -608,8 +608,8 @@ static void ibc_recon_cu(const encoder_state_t * const state, uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - int32_t mv_x = cu->inter.mv[0][0] >> UVG_IMV_4PEL; - int32_t mv_y = cu->inter.mv[0][1] >> UVG_IMV_4PEL; + int32_t mv_x = cu->inter.mv[0][0] >> INTERNAL_MV_PREC; + int32_t mv_y = cu->inter.mv[0][1] >> INTERNAL_MV_PREC; uint32_t ibc_row = y / LCU_WIDTH; int32_t buffer_x = ((x - x_scu) + LCU_WIDTH <= IBC_BUFFER_WIDTH ? @@ -1599,7 +1599,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); - memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); + memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { get_spatial_merge_candidates(x, y, width, height, state->tile->frame->width, diff --git a/src/search.c b/src/search.c index 9743905e..667f10bc 100644 --- a/src/search.c +++ b/src/search.c @@ -1056,7 +1056,7 @@ static double search_cu( } for(int i = 0; i < 8; i++) { - cur_cu->inter.mv[0][0] = (-cu_width - i) << UVG_IMV_4PEL; + cur_cu->inter.mv[0][0] = (-cu_width - i) * (1 << INTERNAL_MV_PREC); cur_cu->inter.mv[0][1] = 0; if (x -cu_width - i < 0) break; From 7ce01b482666f00c867fa874c1bf1e045c967c5b Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Sun, 3 Jul 2022 10:07:17 +0300 Subject: [PATCH 10/36] [ibc] Tune search costs a bit and revert debug vector scaling --- src/debug.c | 2 +- src/search.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/debug.c b/src/debug.c index 1a2f00a0..eed773ee 100644 --- a/src/debug.c +++ b/src/debug.c @@ -131,7 +131,7 @@ void uvg_dbg_yuview_init(const encoder_control_t* const encoder, char* filename, fprintf(yuview_output, "%%;scaleFactor;16\r\n"); fprintf(yuview_output, "%%;type;13;MVInterL0;vector\r\n"); fprintf(yuview_output, "%%;vectorColor;0;0;0;255\r\n"); - fprintf(yuview_output, "%%;scaleFactor;4\r\n"); + fprintf(yuview_output, "%%;scaleFactor;16\r\n"); fprintf(yuview_output, "%%;type;14;MVInterL1;vector\r\n"); fprintf(yuview_output, "%%;vectorColor;255;255;255;255\r\n"); fprintf(yuview_output, "%%;scaleFactor;16\r\n"); diff --git a/src/search.c b/src/search.c index 667f10bc..efc68a8c 100644 --- a/src/search.c +++ b/src/search.c @@ -1064,13 +1064,13 @@ static double search_cu( uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); if (optimized_sad != NULL) { - ibc_cost_y = ibc_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); + ibc_cost_y = ibc_cost = 3*optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); if(state->encoder_control->chroma_format != UVG_CSP_400) { ibc_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); ibc_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); } } else { - ibc_cost_y = ibc_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); + ibc_cost_y = ibc_cost = 3*uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); 
if(state->encoder_control->chroma_format != UVG_CSP_400) { ibc_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); ibc_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); @@ -1081,7 +1081,7 @@ static double search_cu( base_cost_y = ibc_cost_y; best_vector[0] = cur_cu->inter.mv[0][0]; best_vector[1] = cur_cu->inter.mv[0][1]; - //break; + break; } } From 48584eead949acb4dc7f5c7f568d7c8a63c006fd Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 4 Jul 2022 14:44:08 +0300 Subject: [PATCH 11/36] [ibc] Reset the jccr flags to fix a bug with IBC --- src/search.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index efc68a8c..c0ce81e3 100644 --- a/src/search.c +++ b/src/search.c @@ -1010,7 +1010,7 @@ static double search_cu( // Simple IBC search if (can_use_intra && state->frame->slicetype == UVG_SLICE_I - && state->encoder_control->cfg.ibc) { + && state->encoder_control->cfg.ibc && cu_width > 4) { cu_info_t cu_backup = *cur_cu; uint32_t ibc_cost = MAX_INT; @@ -1031,6 +1031,7 @@ static double search_cu( cur_cu->skipped = false; cur_cu->merged = false; cur_cu->inter.mv_cand0 = 0; + cur_cu->joint_cb_cr = 0; optimized_sad_func_ptr_t optimized_sad = uvg_get_optimized_sad(cu_width); uint32_t source_stride = state->tile->frame->width; const int x_scu = SUB_SCU(x); From d288cc46e98615e6cdf8a6fe9c0445b56ff927c9 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Tue, 5 Jul 2022 05:18:10 +0300 Subject: [PATCH 12/36] [ibc] Fix coding of IBC in P and B slices, enable in search --- src/encode_coding_tree.c | 6 +++--- src/search.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index c81ee323..4b41260c 100644 --- 
a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -834,7 +834,7 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVMERGE_L1, abs_x, abs_y, width, height, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); #endif } else { - if (state->frame->slicetype == UVG_SLICE_B) { + if (state->frame->slicetype == UVG_SLICE_B && cur_cu->type != CU_IBC) { // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; @@ -860,7 +860,7 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, // size of the current reference index list (L0/L1) uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx]; - if (ref_LX_size > 1) { + if (ref_LX_size > 1 && cur_cu->type != CU_IBC) { // parseRefFrmIdx int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; @@ -1798,7 +1798,7 @@ double uvg_mock_encode_coding_unit( CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode"); } - if (cur_cu->type == CU_INTER) { + if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); if (ctrl->cfg.amvr && non_zero_mvd) { diff --git a/src/search.c b/src/search.c index c0ce81e3..3093e743 100644 --- a/src/search.c +++ b/src/search.c @@ -1009,7 +1009,7 @@ static double search_cu( } // Simple IBC search - if (can_use_intra && state->frame->slicetype == UVG_SLICE_I + if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc && cu_width > 4) { cu_info_t cu_backup = *cur_cu; @@ -1174,7 +1174,7 @@ static double search_cu( } } - if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; 
cabac->update = 1; From 09e62a68fea61be6a7d519c283e4300c17d8b1e4 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 6 Jul 2022 18:42:34 +0300 Subject: [PATCH 13/36] [ibc] Fix merge candidate selection bug and IBC HMVP size reset at the start of the lcu row --- src/encoderstate.c | 10 ++++++++-- src/inter.c | 28 ++++++++++++---------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index e6f8546e..48839a84 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -958,8 +958,13 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW; // Clear hmvp lut size before each leaf - if (!wavefront) memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); - else state->tile->frame->hmvp_size[state->wfrow->lcu_offset_y] = 0; + if (!wavefront) { + memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); + if(cfg->ibc) memset(state->tile->frame->hmvp_size_ibc, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); + } else { + state->tile->frame->hmvp_size[state->wfrow->lcu_offset_y] = 0; + state->tile->frame->hmvp_size_ibc[state->wfrow->lcu_offset_y] = 0; + } bool use_parallel_encoding = (wavefront && state->parent->children[1].encoder_control); if (!use_parallel_encoding) { @@ -1703,6 +1708,7 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict if (!state->encoder_control->tiles_enable) { memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); + memset(state->tile->frame->hmvp_size_ibc, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu); } // ROI / delta QP maps diff --git a/src/inter.c b/src/inter.c index c0fc0207..8bbe16d7 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1153,6 +1153,8 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, cu_info_t *a1 = 
NULL; cu_info_t *b1 = NULL; + + uint8_t candidates = 0; // A1 availability testing if (x != 0) { @@ -1161,6 +1163,9 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, // the current one and the flag is not set when searching an SMP block. if (a1->type == CU_IBC) { inter_clear_cu_unused(a1); + mv_cand[candidates][0] = a1->inter.mv[0][0]; + mv_cand[candidates][1] = a1->inter.mv[0][1]; + candidates++; } else { a1 = NULL; } @@ -1173,24 +1178,14 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, // before the current one and the flag is not set when searching an SMP // block. if (b1->type == CU_IBC) { - inter_clear_cu_unused(b1); + inter_clear_cu_unused(b1); + mv_cand[candidates][0] = b1->inter.mv[0][0]; + mv_cand[candidates][1] = b1->inter.mv[0][1]; + candidates++; } else { b1 = NULL; } } - - uint8_t candidates = 0; - - // Left predictors without scaling - if (add_mvp_candidate(state, cur_cu, a1, 0, false, mv_cand[candidates])) { - candidates++; - } - - // Top predictors without scaling - if (add_mvp_candidate(state, cur_cu, b1, 0, false, mv_cand[candidates])) { - candidates++; - } - if (candidates > 0) uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); @@ -1202,8 +1197,10 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; - for (int i = 0; i < MIN(4,num_cand); i++) { + for (int i = 0; i < MIN(MAX_NUM_HMVP_CANDS,num_cand); i++) { cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + i]; + + mv_cand[candidates][0] = cand->inter.mv[0][0]; mv_cand[candidates][1] = cand->inter.mv[0][1]; candidates++; @@ -1928,7 +1925,6 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, for (int i = 0; i < num_cand; i++) { const cu_info_t* hmvp_cand = 
&state->tile->frame->hmvp_lut[ctu_row_mul_five + i]; - // ToDo: Add IBC condition if (i > 1 || ((!is_duplicate_candidate(hmvp_cand, a[1])) && (!is_duplicate_candidate(hmvp_cand, b[1]))) ) { mv_cand[candidates].mv[0][0] = state->tile->frame->hmvp_lut[ctu_row_mul_five + i].inter.mv[0][0]; From 65c017c2f2e217849513c3fde6abf8515d01119c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 6 Jul 2022 18:56:36 +0300 Subject: [PATCH 14/36] [ibc] Add check for above block in IBC search --- src/search.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index 3093e743..4e9c0107 100644 --- a/src/search.c +++ b/src/search.c @@ -1020,7 +1020,8 @@ static double search_cu( if(cur_cu->type == CU_INTRA) { - uvg_intra_recon_cu(state,x, y,depth, &intra_search,NULL,lcu); + intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it + uvg_intra_recon_cu(state,x, y,depth, &intra_search,NULL,lcu); } else { uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); } @@ -1056,11 +1057,16 @@ static double search_cu( } } - for(int i = 0; i < 8; i++) { - cur_cu->inter.mv[0][0] = (-cu_width - i) * (1 << INTERNAL_MV_PREC); - cur_cu->inter.mv[0][1] = 0; - - if (x -cu_width - i < 0) break; + for(int i = -1; i < 8; i++) { + if (i == -1) { + if (y_scu < cu_width) continue; + cur_cu->inter.mv[0][0] = 0; + cur_cu->inter.mv[0][1] = (-cu_width) * (1 << INTERNAL_MV_PREC); + } else { + cur_cu->inter.mv[0][0] = (-cu_width - i) * (1 << INTERNAL_MV_PREC); + cur_cu->inter.mv[0][1] = 0; + if (x - cu_width - i < 0) break; + } uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); From 6a0e2a062dbd71cd5742ba5f2c0c9abfd134784b Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 8 Jul 2022 18:14:25 +0300 Subject: [PATCH 15/36] [ibc] Implement a proper search 
for IBC based on Inter search --- src/inter.c | 24 +- src/rdo.c | 103 +++++ src/rdo.h | 1 + src/search.c | 112 +---- src/search_ibc.c | 1117 ++++++++++++++++++++++++++++++++++++++++++++++ src/search_ibc.h | 55 +++ 6 files changed, 1316 insertions(+), 96 deletions(-) create mode 100644 src/search_ibc.c create mode 100644 src/search_ibc.h diff --git a/src/inter.c b/src/inter.c index 8bbe16d7..1cdc77af 100644 --- a/src/inter.c +++ b/src/inter.c @@ -618,7 +618,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t buffer_y = y_scu + mv_y; // The whole block must be to the left of the current position - assert(-mv_x >= width); + assert((-mv_x >= width || -mv_y >= width) && x >= 0 && y >= 0); // Predicted block completely outside of this LCU if (mv_x + x_scu + width <= 0) { @@ -687,7 +687,7 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, const int pu_h = PU_GET_H(cu->part_size, width, i_pu); cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - if (pu->type == CU_IBC) { + if (cu->type == CU_IBC) { ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); } else { @@ -710,11 +710,8 @@ void uvg_inter_pred_pu(const encoder_state_t * const state, predict_chroma); } else { const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture * const ref = - (cu->type == CU_IBC) ? 
state->tile->frame->rec : - (state->frame->ref - ->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]); + const uvg_picture * const ref = + state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]; const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); const unsigned offset_chroma = @@ -1856,6 +1853,19 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; merge_candidates_t merge_cand = { 0 }; const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; + + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + if(cur_cu->type == CU_IBC) { + mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) { + mv_cand[i].dir = 1; + mv_cand[i].mv[0][0] = ibc_mv_cand[i][0]; + mv_cand[i].mv[0][1] = ibc_mv_cand[i][1]; + } + return IBC_MRG_MAX_NUM_CANDS; + } + get_spatial_merge_candidates(x, y, width, height, state->tile->frame->width, state->tile->frame->height, diff --git a/src/rdo.c b/src/rdo.c index 51131c6a..f8ebacdf 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1773,6 +1773,109 @@ double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state, return bits; } + +/** MVD cost calculation with CABAC +* \returns double, the estimated bit cost multiplied by state->lambda_sqrt +* Calculates Motion Vector cost and related costs using CABAC coding +*/ +double uvg_calc_ibc_mvd_cost_cabac(const encoder_state_t * state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) +{ + cabac_data_t state_cabac_copy; + cabac_data_t* cabac; + uint32_t merge_idx; + vector2d_t mvd = { 0, 0 }; + int8_t merged = 0; + int8_t cur_mv_cand = 0; + + x *= 1 << mv_shift; + y *= 1 << mv_shift; + + // Check every candidate to find a match + for
(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x && + merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y) + { + merged = 1; + break; + } + } + + // Store cabac state and contexts + memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t)); + + // Clear bytes and bits and set mode to "count" + state_cabac_copy.only_count = 1; + + cabac = &state_cabac_copy; + double bits = 0; + + if (!merged) { + vector2d_t mvd1 = { + x - mv_cand[0][0], + y - mv_cand[0][1], + }; + vector2d_t mvd2 = { + x - mv_cand[1][0], + y - mv_cand[1][1], + }; + + uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1); + uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2); + + double cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + double cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); + + // Select candidate 1 if it has lower cost + if (cand2_cost < cand1_cost) { + cur_mv_cand = 1; + mvd = mvd2; + } else { + mvd = mvd1; + } + } + + cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); + + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag"); + num_cand = state->encoder_control->cfg.max_merge; + if (merged) { + if (num_cand > 1) { + int32_t ui; + for (ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != merge_idx); + if (ui == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex"); + } else { + CABAC_BIN_EP(cabac, symbol, "MergeIndex"); + bits += 1; + } + if (symbol == 0) break; + } + } + } else { + + // It is safe to drop const here because cabac->only_count is set. 
+ uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits); + + // Signal which candidate MV to use + cabac->cur_ctx = &(cabac->ctx.mvp_idx_model); + CABAC_BIN(cabac, cur_mv_cand, "mvp_flag"); + } + + *bitcost = bits; + + // Store bitcost before restoring cabac + return *bitcost * state->lambda_sqrt; +} + /** MVD cost calculation with CABAC * \returns int * Calculates Motion Vector cost and related costs using CABAC coding diff --git a/src/rdo.h b/src/rdo.h index b7f93729..7f325cfd 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -88,6 +88,7 @@ uint32_t uvg_get_coded_level(encoder_state_t * state, double* coded_cost, double int32_t q_bits,double temp, int8_t last, int8_t type); uvg_mvd_cost_func uvg_calc_mvd_cost_cabac; +uvg_mvd_cost_func uvg_calc_ibc_mvd_cost_cabac; double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state, const cabac_data_t* cabac, diff --git a/src/search.c b/src/search.c index 4e9c0107..c30b3686 100644 --- a/src/search.c +++ b/src/search.c @@ -45,6 +45,7 @@ #include "rdo.h" #include "search_inter.h" #include "search_intra.h" +#include "search_ibc.h" #include "threadqueue.h" #include "transform.h" #include "videoframe.h" @@ -306,7 +307,7 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; - const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // cur_cu is used for TU parameters. @@ -380,7 +381,7 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; const int width = (depth < MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); - const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); double tr_tree_bits = 0; double coeff_bits = 0; @@ -477,7 +478,7 @@ static double cu_rd_cost_tr_split_accurate( enum uvg_tree_type tree_type) { const int width = LCU_WIDTH >> depth; - const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0); + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used for TU parameters. cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -499,7 +500,7 @@ static double cu_rd_cost_tr_split_accurate( int cbf = cbf_is_set_any(pred_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + if (pred_cu->type != CU_INTRA && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); } @@ -1010,93 +1011,26 @@ static double search_cu( // Simple IBC search if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I - && state->encoder_control->cfg.ibc && cu_width > 4) { - cu_info_t cu_backup = *cur_cu; + && state->encoder_control->cfg.ibc + && cu_width > 4 + && (x >= cu_width || y >= cu_width)) { - uint32_t ibc_cost = MAX_INT; - uint32_t ibc_cost_y = MAX_INT; - uint32_t base_cost = MAX_INT; - uint32_t base_cost_y = MAX_INT; + cu_info_t backup_cu = *cur_cu; - - if(cur_cu->type == CU_INTRA) { - intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it - uvg_intra_recon_cu(state,x, y,depth, 
&intra_search,NULL,lcu); + double mode_cost; + double mode_bitcost; + uvg_search_cu_ibc(state, + x, y, + depth, + lcu, + &mode_cost, &mode_bitcost); + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + cur_cu->type = CU_IBC; + cur_cu->joint_cb_cr = 0; } else { - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); - } - - bool ibc_better = false; - cur_cu->type = CU_IBC; - cur_cu->inter.mv_dir = 1; - cur_cu->skipped = false; - cur_cu->merged = false; - cur_cu->inter.mv_cand0 = 0; - cur_cu->joint_cb_cr = 0; - optimized_sad_func_ptr_t optimized_sad = uvg_get_optimized_sad(cu_width); - uint32_t source_stride = state->tile->frame->width; - const int x_scu = SUB_SCU(x); - const int y_scu = SUB_SCU(y); - const uint32_t offset = x_scu + y_scu * LCU_WIDTH; - const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - - mv_t best_vector[2] = {0, 0}; - - - if (optimized_sad != NULL) { - base_cost_y = base_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - base_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); - base_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); - } - } else { - base_cost_y = base_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - base_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); - base_cost += 
uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); - } - } - - for(int i = -1; i < 8; i++) { - if (i == -1) { - if (y_scu < cu_width) continue; - cur_cu->inter.mv[0][0] = 0; - cur_cu->inter.mv[0][1] = (-cu_width) * (1 << INTERNAL_MV_PREC); - } else { - cur_cu->inter.mv[0][0] = (-cu_width - i) * (1 << INTERNAL_MV_PREC); - cur_cu->inter.mv[0][1] = 0; - if (x - cu_width - i < 0) break; - } - - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); - - if (optimized_sad != NULL) { - ibc_cost_y = ibc_cost = 3*optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - ibc_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); - ibc_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2); - } - } else { - ibc_cost_y = ibc_cost = 3*uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - ibc_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); - ibc_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2); - } - } - if (ibc_cost_y < base_cost_y) { - ibc_better = true; - base_cost_y = ibc_cost_y; - best_vector[0] = cur_cu->inter.mv[0][0]; - best_vector[1] = cur_cu->inter.mv[0][1]; - break; 
- } - } - - if (!ibc_better) *cur_cu = cu_backup; - else { - cur_cu->inter.mv[0][0] = best_vector[0]; - cur_cu->inter.mv[0][1] = best_vector[1]; - //fprintf(stderr, "Coding IBC: %d, %d: %d, %d size: %d\r\n", x,y,cur_cu->inter.mv[0][0] / 4, cur_cu->inter.mv[0][1] / 4, cu_width); + *cur_cu = backup_cu; } } @@ -1180,7 +1114,7 @@ static double search_cu( } } - if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; diff --git a/src/search_ibc.c b/src/search_ibc.c new file mode 100644 index 00000000..c3ad713a --- /dev/null +++ b/src/search_ibc.c @@ -0,0 +1,1117 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2022, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "search_ibc.h" +#include "search_inter.h" + +#include +#include + +#include "cabac.h" +#include "encoder.h" +#include "encode_coding_tree.h" +#include "image.h" +#include "imagelist.h" +#include "inter.h" +#include "uvg266.h" +#include "rdo.h" +#include "search.h" +#include "strategies/strategies-ipol.h" +#include "strategies/strategies-picture.h" +#include "transform.h" +#include "videoframe.h" + +typedef struct { + encoder_state_t *state; + + /** + * \brief Current frame + */ + const uvg_picture *pic; + + /** + * \brief Top-left corner of the PU + */ + vector2d_t origin; + int32_t width; + int32_t height; + + mv_t mv_cand[2][2]; + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS]; + int32_t num_merge_cand; + + uvg_mvd_cost_func *mvd_cost_func; + + /** + * \brief Possible optimized SAD implementation for the width, leave as + * NULL for arbitrary-width blocks + */ + optimized_sad_func_ptr_t optimized_sad; + + lcu_t *lcu; + +} ibc_search_info_t; + + + + + +/** + * \return True if the integer vector (x, y) is within the allowed IBC reference range.
+ */ +static INLINE bool intmv_within_ibc_range(const ibc_search_info_t *info, int x, int y) +{ + bool negative_values = x <= 0 && y <= 0; + bool mv_range_valid = ((-y >= info->height) || (-x >= info->width)) && // Must be block height/width away from the block + SUB_SCU(info->origin.y) >= -y && // Y vector must be inside the current CTU + (-x <= IBC_BUFFER_WIDTH-LCU_WIDTH); // X must be inside the buffer + + + return negative_values && mv_range_valid; +} + +static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x, int y) +{ + return intmv_within_ibc_range( + info, + x >> INTERNAL_MV_PREC, + y >> INTERNAL_MV_PREC); +} + + +static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +{ + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); + + cu_info_t cu_backup = *cur_cu; + uint32_t cost = MAX_INT; + + + const uint32_t offset = x_scu + y_scu * LCU_WIDTH; + const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + + cur_cu->type = CU_IBC; + cur_cu->inter.mv_dir = 1; + cur_cu->skipped = false; + cur_cu->merged = false; + cur_cu->inter.mv_cand0 = 0; + cur_cu->joint_cb_cr = 0; + cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; + cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; + + uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + + *cur_cu = cu_backup; + + cost = uvg_satd_any_size(width, + width, + lcu->rec.y + offset, + LCU_WIDTH, + &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], + state->tile->frame->source->stride) >> (UVG_BIT_DEPTH - 8); + + if(state->encoder_control->chroma_format != UVG_CSP_400) { + cost += uvg_satd_any_size(width / 2, + width / 2, + lcu->rec.u + offset_c, + LCU_WIDTH_C, + &state->tile->frame->source->u[(y / 2) * (state->tile->frame->source->stride / 2) + (x / 
2)], + state->tile->frame->source->stride / 2) >> (UVG_BIT_DEPTH - 8); + cost += uvg_satd_any_size(width / 2, + width / 2, + lcu->rec.v + offset_c, + LCU_WIDTH_C, + &state->tile->frame->source->v[(y / 2) * (state->tile->frame->source->stride / 2) + (x / 2)], + state->tile->frame->source->stride / 2) >> (UVG_BIT_DEPTH - 8); + } + + return cost; +} + + +static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +{ + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + cu_info_t cu_backup = *cur_cu; + uint32_t cost = MAX_INT; + + const int x_scu = SUB_SCU(x); + const int y_scu = SUB_SCU(y); + const uint32_t offset = x_scu + y_scu * LCU_WIDTH; + const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + + cur_cu->type = CU_IBC; + cur_cu->inter.mv_dir = 1; + cur_cu->skipped = false; + cur_cu->merged = false; + cur_cu->inter.mv_cand0 = 0; + cur_cu->joint_cb_cr = 0; + cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; + cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; + + uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + + *cur_cu = cu_backup; + + if (optimized_sad != NULL) { + cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + } + } else { + cost = uvg_reg_sad(lcu->rec.y + 
offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride); + if(state->encoder_control->chroma_format != UVG_CSP_400) { + cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + } + } + + return cost; +} + +static bool check_mv_cost_satd(ibc_search_info_t *info, + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) +{ + + } +/** + * \brief Calculate cost for an integer motion vector. + * + * Updates best_mv, best_cost and best_bitcost to the new + * motion vector if it yields a lower cost than the current one. + * + * If the motion vector violates the MV constraints for tiles or WPP, the + * cost is not set. + * + * \return true if best_mv was changed, false otherwise + */ +static bool check_mv_cost(ibc_search_info_t *info, + int x, + int y, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) +{ + if (!intmv_within_ibc_range(info, x, y)) return false; + + double bitcost = 0; + double cost = MAX_DOUBLE; + + cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y); + + if (cost >= *best_cost) return false; + + cost += info->mvd_cost_func( + info->state, + x, y, INTERNAL_MV_PREC, + info->mv_cand, + NULL, + 0, + NULL, + &bitcost + ); + + if (cost >= *best_cost) return false; + + // Set to motion vector in internal pixel precision. 
+ best_mv->x = x * (1 << INTERNAL_MV_PREC); + best_mv->y = y * (1 << INTERNAL_MV_PREC); + *best_cost = cost; + *best_bits = bitcost; + + return true; +} + + +static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) +{ + // Calculate 2 * log2(symbol) + + unsigned bins = 0; + symbol += 0; + if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } + if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } + if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } + if (symbol >= 1 << 1) { bins += 2; } + + // TODO: It might be a good idea to put a small slope on this function to + // make sure any search function that follows the gradient heads towards + // a smaller MVD, but that would require fractional costs and bits being + // used everywhere in inter search. + // return num_bins + 0.001 * symbol; + + return bins; +} + + +/** + * \brief Checks if mv is one of the merge candidates. + * \return true if found else return false + */ +static bool mv_in_merge(const ibc_search_info_t *info, vector2d_t mv) +{ + for (int i = 0; i < info->num_merge_cand; ++i) { + if (info->merge_cand[i].dir == 3) continue; + const vector2d_t merge_mv = { + info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0], + info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + }; + if (merge_mv.x == mv.x * (1 << (INTERNAL_MV_PREC)) && merge_mv.y == mv.y * (1 << (INTERNAL_MV_PREC))) { + return true; + } + } + return false; +} + + +/** + * \brief Select starting point for integer motion estimation search. + * + * Checks the vector (-width, 0), extra_mv and merge candidates and updates + * best_mv to the best one. + */ +static void select_starting_point(ibc_search_info_t *info, + vector2d_t extra_mv, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) +{ + // Check the block directly to the left first, i.e. the vector (-width, 0). + check_mv_cost(info, -info->width, 0, best_cost, best_bits, best_mv); + + // Change to integer precision.
+ extra_mv.x >>= INTERNAL_MV_PREC; + extra_mv.y >>= INTERNAL_MV_PREC; + + // Check mv_in if it's not one of the merge candidates. + if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { + check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); + } + + // Go through candidates + for (int32_t i = 0; i < info->num_merge_cand; ++i) { + int32_t x = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + (1 << (INTERNAL_MV_PREC - 1)) ) >> INTERNAL_MV_PREC; + int32_t y = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + (1 << (INTERNAL_MV_PREC - 1)) ) >> INTERNAL_MV_PREC; + + check_mv_cost(info, x, y, best_cost, best_bits, best_mv); + } +} + + +static double get_ibc_mvd_coding_cost(const encoder_state_t* state, + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) +{ + double bitcost = 4 << CTX_FRAC_BITS; + const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; + bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + bitcost += abs_mvd.y == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS)); + + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; + + // Round and shift back to integer bits. + return bitcost / (1 << CTX_FRAC_BITS); +} + + +static int select_ibc_mv_cand(const encoder_state_t *state, + mv_t mv_cand[2][2], + int32_t mv_x, + int32_t mv_y, + double*cost_out) +{ + const bool same_cand = + (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); + + if (same_cand && !cost_out) { + // Pick the first one if both candidates are the same. 
+ return 0; + } + + double (*mvd_coding_cost)(const encoder_state_t * const state, + const cabac_data_t*, + int32_t, int32_t); + if (state->encoder_control->cfg.mv_rdo) { + mvd_coding_cost = uvg_get_mvd_coding_cost_cabac; + } else { + mvd_coding_cost = get_ibc_mvd_coding_cost; + } + + vector2d_t mvd = { mv_x - mv_cand[0][0], mv_y - mv_cand[0][1] }; + + uvg_change_precision_vector2d(INTERNAL_MV_PREC, UVG_IMV_FPEL, &mvd); + + double cand1_cost = mvd_coding_cost( + state, &state->cabac, + mvd.x, + mvd.y); + + double cand2_cost; + if (same_cand) { + cand2_cost = cand1_cost; + } else { + vector2d_t mvd2 = { mv_x - mv_cand[1][0], mv_y - mv_cand[1][1] }; + uvg_change_precision_vector2d(INTERNAL_MV_PREC, UVG_IMV_FPEL, &mvd2); + cand2_cost = mvd_coding_cost( + state, &state->cabac, + mvd2.x, + mvd2.y); + } + + if (cost_out) { + *cost_out = MIN(cand1_cost, cand2_cost); + } + + // Pick the second candidate if it has lower cost. + return cand2_cost < cand1_cost ? 1 : 0; +} + + +static double calc_ibc_mvd_cost(const encoder_state_t *state, + int x, + int y, + int mv_shift, + mv_t mv_cand[2][2], + inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], + int16_t num_cand, + int32_t ref_idx, + double* bitcost) +{ + double temp_bitcost = 0; + uint32_t merge_idx; + int8_t merged = 0; + + x *= 1 << mv_shift; + y *= 1 << mv_shift; + + // Check every candidate to find a match + for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x && + merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y) { + temp_bitcost += merge_idx; + merged = 1; + break; + } + } + + // Check mvd cost only if mv is not merged + if (!merged) { + double mvd_cost = 0; + select_ibc_mv_cand(state, mv_cand, x, y, &mvd_cost); + temp_bitcost += mvd_cost; + } + *bitcost = temp_bitcost; + return temp_bitcost * state->lambda_sqrt; +} + + +static bool early_terminate(ibc_search_info_t *info, + double *best_cost, + double* best_bits, + 
vector2d_t *best_mv) +{ + static const vector2d_t small_hexbs[7] = { + { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 }, + { 0, -1 }, { -1, 0 }, { 0, 0 }, + }; + + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; + + int first_index = 0; + int last_index = 3; + + for (int k = 0; k < 2; ++k) { + double threshold; + if (info->state->encoder_control->cfg.me_early_termination == + UVG_ME_EARLY_TERMINATION_SENSITIVE) + { + threshold = *best_cost * 0.95; + } else { + threshold = *best_cost; + } + + int best_index = 6; + for (int i = first_index; i <= last_index; i++) { + int x = mv.x + small_hexbs[i].x; + int y = mv.y + small_hexbs[i].y; + + if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) { + best_index = i; + } + } + + // Adjust the movement vector + mv.x += small_hexbs[best_index].x; + mv.y += small_hexbs[best_index].y; + + // If best match is not better than threshold, we stop the search. + if (*best_cost >= threshold) { + return true; + } + + first_index = (best_index + 3) % 4; + last_index = first_index + 2; + } + return false; +} + + + +/** + * \brief Do motion search using the HEXBS algorithm. + * + * \param info search info + * \param extra_mv extra motion vector to check + * \param steps how many steps are done at maximum before exiting, does not affect the final step + * + * Motion vector is searched by first searching iteratively with the large + * hexagon pattern until the best match is at the center of the hexagon. + * As a final step a smaller hexagon is used to check the adjacent pixels. + * + * If a non 0,0 predicted motion vector predictor is given as extra_mv, + * the 0,0 vector is also tried. This is hoped to help in the case where + * the predicted motion vector is way off. In the future even more additional + * points like 0,0 might be used, such as vectors from top or left. 
+ */ +static void hexagon_search(ibc_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) +{ + // The start of the hexagonal pattern has been repeated at the end so that + // the indices between 1-6 can be used as the start of a 3-point list of new + // points to search. + // 6--1,7 + // / \ =) + // 5 0 2,8 + // \ / + // 4---3 + static const vector2d_t large_hexbs[9] = { + { 0, 0 }, + { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }, { -1, -2 }, + { 1, -2 }, { 2, 0 } + }; + // This is used as the last step of the hexagon search. + // 1 + // 2 0 3 + // 4 + static const vector2d_t small_hexbs[9] = { + { 0, 0 }, + { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }, + { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } + }; + + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; + + // Current best index, either to merge_cands, large_hexbs or small_hexbs. + int best_index = 0; + + // Search the initial 7 points of the hexagon. + for (int i = 1; i < 7; ++i) { + if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) { + best_index = i; + } + } + + // Iteratively search the 3 new points around the best match, until the best + // match is in the center. + while (best_index != 0 && steps != 0) { + // decrement count if enabled + if (steps > 0) steps -= 1; + + // Starting point of the 3 offsets to be searched. + unsigned start; + if (best_index == 1) { + start = 6; + } else if (best_index == 8) { + start = 1; + } else { + start = best_index - 1; + } + + // Move the center to the best match. + mv.x += large_hexbs[best_index].x; + mv.y += large_hexbs[best_index].y; + best_index = 0; + + // Iterate through the next 3 points. 
+ for (int i = 0; i < 3; ++i) { + vector2d_t offset = large_hexbs[start + i]; + if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) { + best_index = start + i; + } + } + } + + // Move the center to the best match. + //mv.x += large_hexbs[best_index].x; + //mv.y += large_hexbs[best_index].y; + + // Do the final step of the search with a small pattern. + for (int i = 1; i < 9; ++i) { + check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv); + } +} + +/** +* \brief Do motion search using the diamond algorithm. +* +* \param info search info +* \param extra_mv extra motion vector to check +* \param steps how many steps are done at maximum before exiting +* +* Motion vector is searched by searching iteratively with a diamond-shaped +* pattern. We take care of not checking the direction we came from, but +* further checking for avoiding visits to already visited points is not done. +* +* If a non 0,0 predicted motion vector predictor is given as extra_mv, +* the 0,0 vector is also tried. This is hoped to help in the case where +* the predicted motion vector is way off. In the future even more additional +* points like 0,0 might be used, such as vectors from top or left. 
+**/ +static void diamond_search(ibc_search_info_t *info, + vector2d_t extra_mv, + uint32_t steps, + double *best_cost, + double* best_bits, + vector2d_t *best_mv) +{ + enum diapos { + DIA_UP = 0, + DIA_RIGHT = 1, + DIA_LEFT = 2, + DIA_DOWN = 3, + DIA_CENTER = 4, + }; + + // a diamond shape with the center included + // 0 + // 2 4 1 + // 3 + static const vector2d_t diamond[5] = { + {0, -1}, {1, 0}, {0, 1}, {-1, 0}, + {0, 0} + }; + + // current motion vector + vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; + + // current best index + enum diapos best_index = DIA_CENTER; + + // initial search of the points of the diamond + for (int i = 0; i < 5; ++i) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { + best_index = i; + } + } + + if (best_index == DIA_CENTER) { + // the center point was the best in initial check + return; + } + + // Move the center to the best match. + mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // the arrival direction, the index of the diamond member that will be excluded + enum diapos from_dir = DIA_CENTER; + + // whether we found a better candidate this iteration + uint8_t better_found; + + do { + better_found = 0; + // decrement count if enabled + if (steps > 0) steps -= 1; + + // search the points of the diamond + for (int i = 0; i < 4; ++i) { + // this is where we came from so it's checked already + if (i == from_dir) continue; + + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) { + best_index = i; + better_found = 1; + } + } + + if (better_found) { + // Move the center to the best match. 
+ mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // record where we came from to the next iteration + // the xor operation flips the orientation + from_dir = best_index ^ 0x3; + } + } while (better_found && steps != 0); + // and we're done +} + + +/** + * \brief Check if an identical merge candidate exists in a list + * + * \param all_cands Full list of available merge candidates + * \param cand_to_add Merge candidate to be checked for duplicates + * \param merge Map of the merge units accepted so far; each unit's + * merge_idx is used as an index into all_cands + * + * \return true if an accepted candidate has the same mv[0] vector as cand_to_add + */ +static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, + inter_merge_cand_t *cand_to_add, + unit_stats_map_t *merge) +{ + bool found = false; + for (int i = 0; i < merge->size && !found; ++i) { + int key = merge->keys[i]; + inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx]; + + found = + cand_to_add->mv[0][0] == list_cand->mv[0][0] && + cand_to_add->mv[0][1] == list_cand->mv[0][1]; + } + + return found; +} + +/** + * \brief Collect PU parameters and costs at this depth.
+ * + * \param state encoder state + * \param x_cu x-coordinate of the containing CU + * \param y_cu y-coordinate of the containing CU + * \param depth depth of the CU in the quadtree + * \param part_mode partition mode of the CU + * \param i_pu index of the PU in the CU + * \param lcu containing LCU + * + * \param amvp Return searched AMVP PUs sorted by costs + * \param merge Return searched Merge PUs sorted by costs + */ +static void search_pu_ibc(encoder_state_t * const state, + int x_cu, int y_cu, + int depth, + part_mode_t part_mode, + int i_pu, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + ibc_search_info_t *info) +{ + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = LCU_WIDTH >> depth; + const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); + const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); + const int width = PU_GET_W(part_mode, width_cu, i_pu); + const int height = PU_GET_H(part_mode, width_cu, i_pu); + + // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and + // nRx2N partitions. + const bool merge_a1 = i_pu == 0 || width >= height; + // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and + // 2NxnD partitions. + const bool merge_b1 = i_pu == 0 || width <= height; + + + lcu_t *lcu = info->lcu; + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_IBC; + cur_pu->part_size = part_mode; + cur_pu->depth = depth; + cur_pu->tr_depth = depth; + cur_pu->qp = state->qp; + + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + + FILL(*info, 0); + + info->state = state; + info->pic = frame->source; + info->origin.x = x; + info->origin.y = y; + info->width = width; + info->height = height; + info->mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info->optimized_sad = uvg_get_optimized_sad(width); + info->lcu = lcu; + + // Search for merge mode candidates + info->num_merge_cand = uvg_inter_get_merge_cand( + state, + x, y, + width, height, + merge_a1, merge_b1, + info->merge_cand, + lcu); + + // Merge Analysis starts here + merge->size = 0; + for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { + merge->keys[i] = -1; + merge->cost[i] = MAX_DOUBLE; + } + + const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1); +#ifdef COMPLETE_PRED_MODE_BITS + // Technically counting these bits would be correct, however counting + // them universally degrades quality so this block is disabled by default + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); +#else + const double no_skip_flag = 0; +#endif + // Check motion vector constraints and perform rough search + for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) { + + inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx]; + cur_pu->inter.mv_dir = cur_cand->dir; + cur_pu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_pu->inter.mv[0][1] = cur_cand->mv[0][1]; + + + bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); + + // Don't try merge candidates that don't satisfy mv constraints. 
+ // Don't add duplicates to list + if ((!fracmv_within_ibc_range(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) || + is_duplicate) + { + continue; + } + uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu); + merge->unit[merge->size] = *cur_pu; + merge->unit[merge->size].type = CU_IBC; + merge->unit[merge->size].merge_idx = merge_idx; + merge->unit[merge->size].merged = true; + merge->unit[merge->size].skipped = false; + + double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); + if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + } + else { + merge->cost[merge->size] = uvg_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + bits += no_skip_flag; + merge->cost[merge->size] += bits * info->state->lambda_sqrt; + } + // Add cost of coding the merge index + merge->bits[merge->size] = bits; + merge->keys[merge->size] = merge->size; + + + merge->size++; + } + + assert(merge->size <= MAX_UNIT_STATS_MAP_SIZE); + uvg_sort_keys_by_cost(merge); + + // Try early skip decision on just one merge candidate if available + int num_rdo_cands = MIN(1, merge->size); + + // Early Skip Mode Decision + bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; + if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { + if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { + merge->size = 1; + merge->bits[0] = merge->bits[merge->keys[merge_key]]; + merge->cost[0] = merge->cost[merge->keys[merge_key]]; + merge->unit[0] = merge->unit[merge->keys[merge_key]]; + merge->keys[0] = 0; + } + else if(cfg->rdo < 2) { + // Reconstruct blocks with merge candidate. 
+ // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. + int merge_idx = merge->unit[merge->keys[merge_key]].merge_idx; + cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; + cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; + cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; + uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + uvg_inter_recon_cu(state, lcu, x, y, width, true, false); + uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true); + + if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + continue; + } + else if (has_chroma) { + uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_quantize_lcu_residual(state, false, has_chroma, + false, /*we are only checking for lack of coeffs so no need to check jccr*/ + x, y, depth, cur_pu, lcu, true); + if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cur_pu->type = CU_IBC; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = true; + + merge->size = 1; + merge->cost[0] = 0.0; // TODO: Check this + merge->bits[0] = merge_idx; // TODO: Check this + merge->unit[0] = *cur_pu; + return; + } + } + } + } + } + + // AMVP search starts here + amvp[0].size = 0; + amvp[0].cost[0] = MAX_DOUBLE; + + + // Do the motion search + + uvg_inter_get_mv_cand(info->state, + info->origin.x, + info->origin.y, + info->width, + info->height, + info->mv_cand, + cur_pu, + lcu, + NULL); + + vector2d_t best_mv = { 0, 0 }; + + double best_cost = MAX_DOUBLE; + double best_bits = MAX_INT; + + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). 
+ select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv); + bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv); + + if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) { + + switch (cfg->ime_algorithm) { + case UVG_IME_DIA: + diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + default: + hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps, + &best_cost, &best_bits, &best_mv); + break; + } + } + + if (best_cost < MAX_DOUBLE) { + // Recalculate inter cost with SATD. + best_cost = calculate_ibc_cost_satd( + info->state, + lcu, + info->origin.x, + info->origin.y, + info->width, + (best_mv.x >> INTERNAL_MV_PREC), + (best_mv.y >> INTERNAL_MV_PREC)); + best_cost += best_bits * info->state->lambda_sqrt; + } + + + int cu_mv_cand = select_ibc_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL); + + // Update best unipreds for biprediction + bool valid_mv = fracmv_within_ibc_range(info, best_mv.x, best_mv.y); + if (valid_mv && best_cost < MAX_DOUBLE) { + + // Map reference index to L0/L1 pictures + unit_stats_map_t *cur_map = &amvp[0]; + int entry = cur_map->size; + cu_info_t *unipred_pu = &cur_map->unit[entry]; + *unipred_pu = *cur_pu; + unipred_pu->type = CU_IBC; + unipred_pu->merged = false; + unipred_pu->skipped = false; + unipred_pu->inter.mv_dir = 1; + unipred_pu->inter.mv[0][0] = (mv_t)best_mv.x; + unipred_pu->inter.mv[0][1] = (mv_t)best_mv.y; + CU_SET_MV_CAND(unipred_pu, 0, cu_mv_cand); + + cur_map->cost[entry] = best_cost; + cur_map->bits[entry] = best_bits; + cur_map->keys[entry] = entry; + cur_map->size++; + } + + + assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); + uvg_sort_keys_by_cost(&amvp[0]); + + int best_keys[2] = { + amvp[0].size > 0 ? amvp[0].keys[0] : 0, + amvp[1].size > 0 ? 
amvp[1].keys[0] : 0 + }; + + cu_info_t *best_unipred[2] = { + &amvp[0].unit[best_keys[0]], + &amvp[1].unit[best_keys[1]] + }; + + + if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + } + + + if(cfg->rdo < 2) { + int predmode_ctx; + const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); + + const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); + const double total_bits = no_skip_flag + pred_mode_bits; + if(amvp[0].size > 0) { + const uint8_t best_key = amvp[0].keys[0]; + amvp[0].bits[best_key] += total_bits; + amvp[0].cost[best_key] += (total_bits)* state->lambda_sqrt; + } + } +} + + + +/** + * \brief Update CU to have best modes at this depth. + * + * Only searches the 2Nx2N partition mode. + * + * \param state encoder state + * \param x x-coordinate of the CU + * \param y y-coordinate of the CU + * \param depth depth of the CU in the quadtree + * \param lcu containing LCU + * + * \param inter_cost Return inter cost + * \param inter_bitcost Return inter bitcost + */ +void uvg_search_cu_ibc(encoder_state_t * const state, + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) +{ + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + + // Store information of L0, L1, and bipredictions. + // Best cost will be left at MAX_DOUBLE if no valid CU is found. + // These will be initialized by the following function. 
+ unit_stats_map_t amvp[3]; + unit_stats_map_t merge; + ibc_search_info_t info; + + info.lcu = lcu; + + search_pu_ibc(state, + x, y, depth, + SIZE_2Nx2N, 0, + amvp, + &merge, + &info); + + // Early Skip CU decision + if (merge.size == 1 && merge.unit[0].skipped) { + *inter_cost = merge.cost[0]; + *inter_bitcost = merge.bits[0]; + return; + } + + cu_info_t *best_inter_pu = NULL; + + // Find best AMVP PU + for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { + + int best_key = amvp[mv_dir - 1].keys[0]; + + if (amvp[mv_dir - 1].size > 0 && + amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + + best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; + *inter_cost = amvp[mv_dir - 1].cost[best_key]; + *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; + } + } + + // Compare best AMVP against best Merge mode + int best_merge_key = merge.keys[0]; + + if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) { + + best_inter_pu = &merge.unit[best_merge_key]; + *inter_cost = merge.cost[best_merge_key]; + *inter_bitcost = 0; // TODO: Check this + } + + if (*inter_cost == MAX_DOUBLE) { + // Could not find any motion vector. + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_INT; + return; + } + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + *cur_pu = *best_inter_pu; + cur_pu->type = CU_IBC; + + uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), + true, state->encoder_control->chroma_format != UVG_CSP_400); + + if (*inter_cost < MAX_DOUBLE) { + assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); + } +} diff --git a/src/search_ibc.h b/src/search_ibc.h new file mode 100644 index 00000000..14ce3b6f --- /dev/null +++ b/src/search_ibc.h @@ -0,0 +1,55 @@ +#pragma once + +/***************************************************************************** + * This file is part of uvg266 VVC encoder. 
+ * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Compression + * \file + * Intra block copy (IBC) parameter search.
+ */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "inter.h" +#include "uvg266.h" + + +void uvg_search_cu_ibc(encoder_state_t * const state, + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost); + + + From 6de2e2d581d7f070c0870c3bb02218d71f18ce5c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 3 Aug 2022 10:46:02 +0300 Subject: [PATCH 16/36] [ibc] Fix some git merge issues and IBC merge candidate selection --- src/inter.c | 75 ++++++++++++++++++++++++++++++++---------------- src/search_ibc.c | 10 +++---- 2 files changed, 56 insertions(+), 29 deletions(-) diff --git a/src/inter.c b/src/inter.c index 1cdc77af..920c8d45 100644 --- a/src/inter.c +++ b/src/inter.c @@ -985,11 +985,10 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } - -static INLINE int16_t get_scaled_mv(int16_t mv, int scale) +static INLINE mv_t get_scaled_mv(mv_t mv, int scale) { int32_t scaled = scale * mv; - return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8); + return CLIP(-131072, 131071, (scaled + 127 + (scaled < 0)) >> 8); } #define MV_EXPONENT_BITCOUNT 4 @@ -1108,6 +1107,19 @@ static INLINE bool add_mvp_candidate(const encoder_state_t *state, } +static bool is_duplicate_candidate_ibc(const cu_info_t* cu1, const cu_info_t* cu2) +{ + if (!cu2) return false; + + if (cu1->inter.mv[0][0] != cu2->inter.mv[0][0] || + cu1->inter.mv[0][1] != cu2->inter.mv[0][1]) { + return false; + } + + + return true; +} + /** * \brief Get merge candidates for current block. * @@ -1175,10 +1187,12 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, // before the current one and the flag is not set when searching an SMP // block. 
if (b1->type == CU_IBC) { - inter_clear_cu_unused(b1); - mv_cand[candidates][0] = b1->inter.mv[0][0]; - mv_cand[candidates][1] = b1->inter.mv[0][1]; - candidates++; + if(!is_duplicate_candidate_ibc(b1, a1)) { + inter_clear_cu_unused(b1); + mv_cand[candidates][0] = b1->inter.mv[0][0]; + mv_cand[candidates][1] = b1->inter.mv[0][1]; + candidates++; + } } else { b1 = NULL; } @@ -1196,12 +1210,22 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row]; for (int i = 0; i < MIN(MAX_NUM_HMVP_CANDS,num_cand); i++) { cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + i]; + bool duplicate = false; + // Check that the HMVP candidate is not duplicate + if (is_duplicate_candidate_ibc(cand, a1)) { + duplicate = true; + } else if(is_duplicate_candidate_ibc(cand, b1)) { + duplicate = true; + } - mv_cand[candidates][0] = cand->inter.mv[0][0]; - mv_cand[candidates][1] = cand->inter.mv[0][1]; - candidates++; - if (candidates == IBC_MRG_MAX_NUM_CANDS) return; + // allow duplicates after the first hmvp lut item + if (!duplicate || i > 0) { + mv_cand[candidates][0] = cand->inter.mv[0][0]; + mv_cand[candidates][1] = cand->inter.mv[0][1]; + candidates++; + if (candidates == IBC_MRG_MAX_NUM_CANDS) return; + } } } @@ -1390,12 +1414,6 @@ static void get_spatial_merge_candidates_cua(const cu_array_t *cua, } } -static INLINE mv_t get_scaled_mv(mv_t mv, int scale) -{ - int32_t scaled = scale * mv; - return CLIP(-131072, 131071, (scaled + 127 + (scaled < 0)) >> 8); -} - /** * \brief Try to add a temporal MVP or merge candidate. 
* @@ -1697,14 +1715,23 @@ static void hmvp_shift_lut(cu_info_t* lut, int32_t size, int32_t start, int32_t } } -static bool hmvp_push_lut_item(cu_info_t* lut, int32_t size, const cu_info_t* cu) { +static bool hmvp_push_lut_item(cu_info_t* lut, int32_t size, const cu_info_t* cu, bool ibc) { int8_t duplicate = -1; - for (int i = 0; i < size; i++) { - if (is_duplicate_candidate(cu, (const cu_info_t*)&lut[i])) { - duplicate = i; - break; + if (ibc) { + for (int i = 0; i < size; i++) { + if (is_duplicate_candidate_ibc(cu, (const cu_info_t *)&lut[i])) { + duplicate = i; + break; + } + } + } else { + for (int i = 0; i < size; i++) { + if (is_duplicate_candidate(cu, (const cu_info_t *)&lut[i])) { + duplicate = i; + break; + } } } // If duplicate found, shift the whole lut up to the duplicate, otherwise to the end @@ -1741,12 +1768,12 @@ void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_ if (cu->type == CU_IBC) { - bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], state->tile->frame->hmvp_size_ibc[ctu_row], cu); + bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], state->tile->frame->hmvp_size_ibc[ctu_row], cu, true); if(add_row && state->tile->frame->hmvp_size_ibc[ctu_row] < MAX_NUM_HMVP_CANDS) { state->tile->frame->hmvp_size_ibc[ctu_row]++; } } else { - bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut[ctu_row_mul_five], state->tile->frame->hmvp_size[ctu_row], cu); + bool add_row = hmvp_push_lut_item(&state->tile->frame->hmvp_lut[ctu_row_mul_five], state->tile->frame->hmvp_size[ctu_row], cu, false); if(add_row && state->tile->frame->hmvp_size[ctu_row] < MAX_NUM_HMVP_CANDS) { state->tile->frame->hmvp_size[ctu_row]++; } diff --git a/src/search_ibc.c b/src/search_ibc.c index c3ad713a..3d07dc61 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -93,7 +93,8 @@ static INLINE bool intmv_within_ibc_range(const ibc_search_info_t *info, int x, bool 
negative_values = x <= 0 && y <= 0; bool mv_range_valid = ((-y >= info->height) || (-x >= info->width)) && // Must be block height/width away from the block SUB_SCU(info->origin.y) >= -y && // Y vector must be inside the current CTU - (-x <= IBC_BUFFER_WIDTH-LCU_WIDTH); // X must be inside the buffer + (-x <= IBC_BUFFER_WIDTH-LCU_WIDTH) && // X must be inside the buffer + info->origin.x + x >= 0; // Don't go outside of the frame return negative_values && mv_range_valid; @@ -406,7 +407,6 @@ static int select_ibc_mv_cand(const encoder_state_t *state, return cand2_cost < cand1_cost ? 1 : 0; } - static double calc_ibc_mvd_cost(const encoder_state_t *state, int x, int y, @@ -878,9 +878,9 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true); + uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; @@ -889,7 +889,7 @@ static void search_pu_ibc(encoder_state_t * const state, uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, true); + x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf, depth)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; From 34c7c432f98e7722795867f93403ba1c6d98d6b5 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 8 May 2023 11:58:40 +0300 Subject: [PATCH 17/36] [ibc] Fix deblocking for the IBC 
blocks --- src/filter.c | 16 +++++++++------- src/search.c | 5 +++-- src/search_ibc.c | 1 - 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/filter.c b/src/filter.c index edc9f1e1..2d51a17c 100644 --- a/src/filter.c +++ b/src/filter.c @@ -789,10 +789,10 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, cu_p->inter.mv[1][0] = 0; cu_p->inter.mv[1][1] = 0; } - const int refP0 = (cu_p->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_p->inter.mv_ref[0]] : -1; - const int refP1 = (cu_p->inter.mv_dir & 2) ? state->frame->ref_LX[1][cu_p->inter.mv_ref[1]] : -1; - const int refQ0 = (cu_q->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_q->inter.mv_ref[0]] : -1; - const int refQ1 = (cu_q->inter.mv_dir & 2) ? state->frame->ref_LX[1][cu_q->inter.mv_ref[1]] : -1; + const int refP0 = (cu_p->type == CU_IBC)?-2:(cu_p->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_p->inter.mv_ref[0]] : -1; + const int refP1 = (cu_p->type == CU_IBC)?-2:(cu_p->inter.mv_dir & 2) ? state->frame->ref_LX[1][cu_p->inter.mv_ref[1]] : -1; + const int refQ0 = (cu_q->type == CU_IBC)?-2:(cu_q->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_q->inter.mv_ref[0]] : -1; + const int refQ1 = (cu_q->type == CU_IBC)?-2:(cu_q->inter.mv_dir & 2) ? 
state->frame->ref_LX[1][cu_q->inter.mv_ref[1]] : -1; const mv_t* mvQ0 = cu_q->inter.mv[0]; const mv_t* mvQ1 = cu_q->inter.mv[1]; @@ -830,12 +830,14 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, } } else /*if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3)*/ { //is P-slice - if (cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) { + const int refP = (cu_p->type == CU_IBC)?-2:state->frame->ref_LX[0][cu_p->inter.mv_ref[0]]; + const int refQ = (cu_q->type == CU_IBC)?-2:state->frame->ref_LX[0][cu_q->inter.mv_ref[0]]; + if (refP != refQ) { // Reference pictures are different strength = 1; } else if ( - ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= mvdThreashold) || - (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= mvdThreashold))) { + ((abs(cu_q->inter.mv[0][0] - cu_p->inter.mv[0][0]) >= mvdThreashold) || + (abs(cu_q->inter.mv[0][1] - cu_p->inter.mv[0][1]) >= mvdThreashold))) { // Absolute motion vector diff between blocks >= 0.5 (Integer pixel) strength = 1; } diff --git a/src/search.c b/src/search.c index c30b3686..f4f040eb 100644 --- a/src/search.c +++ b/src/search.c @@ -1013,7 +1013,8 @@ static double search_cu( if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc && cu_width > 4 - && (x >= cu_width || y >= cu_width)) { + && (x >= cu_width || y >= cu_width) + && !cur_cu->skipped) { cu_info_t backup_cu = *cur_cu; @@ -1114,7 +1115,7 @@ static double search_cu( } } - if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; diff --git a/src/search_ibc.c b/src/search_ibc.c index 3d07dc61..fe6a6da0 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -336,7 +336,6 @@ static void 
select_starting_point(ibc_search_info_t *info, } } - static double get_ibc_mvd_coding_cost(const encoder_state_t* state, const cabac_data_t* cabac, const int32_t mvd_hor, From 8aded6406bbaeaa7ff946399c76fb72324a37ed2 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 8 May 2023 12:26:06 +0300 Subject: [PATCH 18/36] [ibc] Fix issue in search --- src/search_ibc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/search_ibc.c b/src/search_ibc.c index fe6a6da0..0457e215 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -906,7 +906,9 @@ static void search_pu_ibc(encoder_state_t * const state, } // AMVP search starts here - amvp[0].size = 0; + amvp[0].size = 0; + amvp[1].size = 0; + amvp[2].size = 0; amvp[0].cost[0] = MAX_DOUBLE; From 31fbf453c1eb2c00cc32329f228caac2d9d1edc6 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 19 Jun 2023 09:17:46 +0300 Subject: [PATCH 19/36] [ibc] Fix IBCFlag writing with I-frames and clean up some code --- src/encode_coding_tree.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 4b41260c..68a020e8 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1482,7 +1482,8 @@ void uvg_encode_coding_tree( if (cur_cu->skipped) { - if (state->encoder_control->cfg.ibc) { // ToDo: Only for luma channel + if (state->encoder_control->cfg.ibc && state->frame->slicetype != UVG_SLICE_I) + { // ToDo: Only for luma channel // ToDo: Disable for blocks over 64x64 pixels int8_t ctx_ibc = 0; if (left_cu && left_cu->type == CU_IBC) ctx_ibc++; @@ -1835,35 +1836,38 @@ void uvg_encode_mvd(encoder_state_t * const state, const int8_t ver_abs_gr0 = mvd_ver != 0; const uint32_t mvd_hor_abs = abs(mvd_hor); const uint32_t mvd_ver_abs = abs(mvd_ver); + double temp_bits_out = 0.0; cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, 
"abs_mvd_greater0_flag_hor"); - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), temp_bits_out, "abs_mvd_greater0_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), temp_bits_out, "abs_mvd_greater0_flag_ver"); cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; if (hor_abs_gr0) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), temp_bits_out,"abs_mvd_greater1_flag_hor"); } if (ver_abs_gr0) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), temp_bits_out, "abs_mvd_greater1_flag_ver"); } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); - if(cabac->only_count) *bits_out += bits; + if(cabac->only_count) temp_bits_out += bits; } uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1; CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); - if (cabac->only_count) *bits_out += 1; + if (cabac->only_count) temp_bits_out += 1; } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); - if (cabac->only_count) *bits_out += bits; + if (cabac->only_count) temp_bits_out += bits; } uint32_t mvd_ver_sign = mvd_ver > 0 ? 
0 : 1; CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); - if (cabac->only_count) *bits_out += 1; + if (cabac->only_count) temp_bits_out += 1; } + + if(bits_out) *bits_out = temp_bits_out; } From 68382f9e2517ac007f16f6ad70f125e19eea7684 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 19 Jun 2023 13:43:31 +0300 Subject: [PATCH 20/36] [ibc] Handle 4x4 block cases --- src/encode_coding_tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 68a020e8..7a3f401c 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1466,7 +1466,7 @@ void uvg_encode_coding_tree( } // Encode skip flag - if ((state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) && cu_width != 4) { + if ((state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc)) { int8_t ctx_skip = 0; @@ -1476,9 +1476,10 @@ void uvg_encode_coding_tree( if (above_cu && above_cu->skipped) { ctx_skip++; } - - cabac->cur_ctx = &(cabac->ctx.cu_skip_flag_model[ctx_skip]); - CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); + if (cu_width > 4 || state->encoder_control->cfg.ibc) { + cabac->cur_ctx = &(cabac->ctx.cu_skip_flag_model[ctx_skip]); + CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); + } if (cur_cu->skipped) { @@ -1518,7 +1519,7 @@ void uvg_encode_coding_tree( } // Prediction mode - if (state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc) { // ToDo: Only for luma channel + if ((state->frame->slicetype == UVG_SLICE_I || cu_width == 4) && state->encoder_control->cfg.ibc) { // ToDo: Only for luma channel // ToDo: Disable for blocks over 64x64 pixels int8_t ctx_ibc = 0; if (left_cu && left_cu->type == CU_IBC) ctx_ibc++; From 7252befc177428355ca5430aa943a9d913c2e0ce Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 21 Jun 2023 22:08:41 +0300 Subject: [PATCH 21/36] [ibc] Add a hashmap implementation for IBC hash search --- CMakeLists.txt | 2 +- 
src/hashmap.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++ src/hashmap.h | 60 ++++++++++++++++++++ 3 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 src/hashmap.c create mode 100644 src/hashmap.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 99fa8a88..2af1420e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,7 +200,7 @@ file(GLOB SOURCE_GROUP_CABAC RELATIVE ${PROJECT_SOURCE_DIR} "src/bitstream.*" "s file(GLOB SOURCE_GROUP_COMPRESSION RELATIVE ${PROJECT_SOURCE_DIR} "src/search*" "src/rdo.*" "src/fast_coeff*") file(GLOB SOURCE_GROUP_CONSTRAINT RELATIVE ${PROJECT_SOURCE_DIR} "src/constraint.*" "src/ml_*") file(GLOB SOURCE_GROUP_CONTROL RELATIVE ${PROJECT_SOURCE_DIR} "src/cfg.*" "src/encoder.*" "src/encoder_state-c*" "src/encoder_state-g*" "src/encoderstate*" "src/gop.*" "src/input_frame_buffer.*" "src/uvg266*" "src/rate_control.*" "src/mip_data.h") -file(GLOB SOURCE_GROUP_DATA_STRUCTURES RELATIVE ${PROJECT_SOURCE_DIR} "src/cu.*" "src/image.*" "src/imagelist.*" "src/videoframe.*") +file(GLOB SOURCE_GROUP_DATA_STRUCTURES RELATIVE ${PROJECT_SOURCE_DIR} "src/cu.*" "src/image.*" "src/imagelist.*" "src/videoframe.*" "src/hashmap.*") file(GLOB SOURCE_GROUP_EXTRAS RELATIVE ${PROJECT_SOURCE_DIR} "src/extras/*.h" "src/extras/*.c") file(GLOB_RECURSE SOURCE_GROUP_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") file(GLOB SOURCE_GROUP_RECON RELATIVE ${PROJECT_SOURCE_DIR} "src/alf.*" "src/filter.*" "src/inter.*" "src/intra.*" "src/reshape.*" "src/sao.*" "src/scalinglist.*" "src/tables.*" "src/transform.*") diff --git a/src/hashmap.c b/src/hashmap.c new file mode 100644 index 00000000..3e9c0890 --- /dev/null +++ b/src/hashmap.c @@ -0,0 +1,145 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2023, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "hashmap.h" + +/** + * \brief This function creates a node for the uvg_hashmap. 
+ * + * \param key the key of the node to be created + * \param value the value of the node to be created + * \return uvg_hashmap_node a heap-allocated node with the given key and value and next set to NULL (the malloc result is not checked) + */ +uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value) { + uvg_hashmap_node* new_node = (uvg_hashmap_node*)malloc(sizeof(uvg_hashmap_node)); + new_node->key = key; + new_node->value = value; + new_node->next = NULL; + return new_node; +} + +/** + * \brief This function creates a new uvg_hashmap with a given bucket size. + * + * \param bucket_size the number of buckets (chain heads) in the hashmap table + * \return uvg_hashmap a heap-allocated uvg_hashmap with every bucket set to NULL (the malloc results are not checked) + */ +uvg_hashmap* uvg_hashmap_create(uint32_t bucket_size) +{ + uvg_hashmap* new_hashmap = (uvg_hashmap*)malloc(sizeof(uvg_hashmap)); + new_hashmap->bucket_size = bucket_size; + new_hashmap->table = (uvg_hashmap_node**)malloc(sizeof(uvg_hashmap_node*) * bucket_size); + for (int i = 0; i < bucket_size; i++) { + new_hashmap->table[i] = NULL; + } + return new_hashmap; +} + +/** + * \brief This function calculates the hash index for a given + * key and bucket size by xor-ing right-shifted copies of the key with a constant. + * + * \param key the key to be hashed + * \param bucket_size the size of the hashmap bucket, used as the modulus so it must be non-zero + * \return the hashed index for the given key and bucket size, in the range [0, bucket_size). + */ +uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size) { + key ^= (key >> 20) ^ (key >> 12); + return (key ^ (key >> 7) ^ (key >> 4) ^ 2654435769U) % bucket_size; +} + +/** + * \brief This function inserts a new node into the hashmap.
+ * + * \param map the hashmap to insert the new node into + * \param key the key of the new node + * \param value the value of the new node + */ +void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value) { + uint32_t hash_index = uvg_hashmap_hash(key, map->bucket_size); + uvg_hashmap_node* new_node = uvg_hashmap_create_node(key, value); + new_node->next = map->table[hash_index]; + map->table[hash_index] = new_node; +} + +/** + * \brief This function searches the hashmap for the given key. + * + * \param map the hashmap to search in + * \param key the key to search for + * \return uvg_hashmap_node the node with the given key, NULL if not found. + */ +uvg_hashmap_node* uvg_hashmap_search(uvg_hashmap* map, uint32_t key) { + uint32_t hashIndex = uvg_hashmap_hash(key, map->bucket_size); + uvg_hashmap_node* temp = map->table[hashIndex]; + uvg_hashmap_node* return_node = NULL; + // Search key in chain and return all of them + while (temp) { + if (temp->key == key) { + uvg_hashmap_node* new_node = uvg_hashmap_create_node(key, temp->value); + if (return_node != NULL) { + new_node->next = return_node; + } + return_node = new_node; + } + temp = temp->next; + } + return return_node; +} + +/** + * \brief This function frees the memory of a given hashmap node. + * + * \param node the node to free the memory of. + */ +void uvg_hashmap_node_free(uvg_hashmap_node* node) +{ + while (node) { + uvg_hashmap_node* to_delete = node; + node = node->next; + free(to_delete); + } +} + +/** + * \brief This function frees the memory of a given hashmap. + * + * \param map the hashmap to free the memory of. 
+ */ +void uvg_hashmap_free(uvg_hashmap* map) { + for (int i = 0; i < map->bucket_size; i++) { + uvg_hashmap_node* temp = map->table[i]; + uvg_hashmap_node_free(temp); + } + free(map->table); + free(map); +} diff --git a/src/hashmap.h b/src/hashmap.h new file mode 100644 index 00000000..61e868bf --- /dev/null +++ b/src/hashmap.h @@ -0,0 +1,60 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2023, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include +#include +#include + +typedef struct uvg_hashmap_node { + uint32_t key; + uint32_t value; + struct uvg_hashmap_node* next; +} uvg_hashmap_node; + +typedef struct uvg_hashmap { + uint32_t bucket_size; + uvg_hashmap_node** table; +} uvg_hashmap; + +uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value); + +uvg_hashmap* uvg_hashmap_create(uint32_t bucket_size); + +uint32_t uvg_hashmap_hash(uint32_t key); + +void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value); + +uvg_hashmap_node* uvg_hashmap_search(uvg_hashmap* map, uint32_t key); + +void uvg_hashmap_node_free(uvg_hashmap_node* node); + +void uvg_hashmap_free(uvg_hashmap* map); From a32a318d1875be462e5e3455476ac569654d1621 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 22 Jun 2023 14:36:05 +0300 Subject: [PATCH 22/36] [ibc] Add CRC32C functions, with SSE 4.2 optimized CRC calculations --- CMakeLists.txt | 4 +- src/hashmap.h | 5 ++ src/inter.c | 17 +++++ src/strategies/generic/picture-generic.c | 42 +++++++++++++ src/strategies/sse42/picture-sse42.c | 80 ++++++++++++++++++++++++ src/strategies/sse42/picture-sse42.h | 45 +++++++++++++ src/strategies/strategies-picture.c | 50 +++++++++++++++ src/strategies/strategies-picture.h | 7 +++ 8 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 src/strategies/sse42/picture-sse42.c create mode 100644 
src/strategies/sse42/picture-sse42.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2af1420e..c0ec99c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,7 @@ target_include_directories(uvg266 PUBLIC src/strategies) file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") set(CLI_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") @@ -175,7 +176,8 @@ else() list(APPEND ALLOW_AVX2 "x86_64" "AMD64") if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) - set_property( SOURCE ${LIB_SOURCES_STRATEGIES_SSE41} APPEND PROPERTY COMPILE_FLAGS "-msse4.1" ) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_SSE41} APPEND PROPERTY COMPILE_FLAGS "-msse4.1" ) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_SSE42} APPEND PROPERTY COMPILE_FLAGS "-msse4.2" ) endif() set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) diff --git a/src/hashmap.h b/src/hashmap.h index 61e868bf..45041eb3 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -34,6 +34,11 @@ #include #include +// The ratio of the hashmap bucket size to the maximum number of elements +#define UVG_HASHMAP_RATIO 0.35 +// Use Hashmap for 4x4 blocks +#define UVG_HASHMAP_BLOCKSIZE 4 + typedef struct uvg_hashmap_node { uint32_t key; uint32_t value; diff --git a/src/inter.c b/src/inter.c index 920c8d45..3bbef427 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1666,6 +1666,13 @@ void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } +/** +• \brief Checks if two CUs have similar motion vectors. 
The function takes two CUs and compares their motion vectors. +• \param cu1 first CU +• \param cu2 second CU +• \return returns 0 if the two CUs have dissimilar motion vectors, and 1 if the motions are similar. +*/ + static bool is_duplicate_candidate(const cu_info_t* cu1, const cu_info_t* cu2) { if (!cu2) return false; @@ -1684,6 +1691,16 @@ static bool is_duplicate_candidate(const cu_info_t* cu1, const cu_info_t* cu2) return true; } +/** +* Adds a merge candidate to the list of possible candidates, if it is not a duplicate. +* +* \param cand The candidate to be added. +* \param possible_duplicate1 The first possible duplicate candidate to check for duplication. +* \param possible_duplicate2 The second possible duplicate candidate to check for duplication. +* \param merge_cand_out The output parameter to store the merge candidate information. +* +* @return Returns true if the merge candidate was added successfully, false otherwise. +*/ static bool add_merge_candidate(const cu_info_t *cand, const cu_info_t *possible_duplicate1, const cu_info_t *possible_duplicate2, diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 09fce28a..c6b62909 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -793,9 +793,51 @@ static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* } } +INLINE static uint32_t uvg_crc32c_4_generic(uint32_t crc, const uvg_pixel *buf) +{ + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[0]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[1]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[2]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[3]) & 0xFF]; + return crc; +} + +static uint32_t uvg_crc32c_4x4_8bit_generic(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = uvg_crc32c_4_generic(crc, &buf[0 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[1 * pic_stride]); + crc = 
uvg_crc32c_4_generic(crc, &buf[2 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[3 * pic_stride]); + return crc ^ 0xFFFFFFFF; +} + +static uint32_t uvg_crc32c_4x4_16bit_generic(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = uvg_crc32c_4_generic(crc, &buf[0 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[0 * pic_stride] + 4); + + crc = uvg_crc32c_4_generic(crc, &buf[1 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[1 * pic_stride] + 4); + + crc = uvg_crc32c_4_generic(crc, &buf[2 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[2 * pic_stride] + 4); + + crc = uvg_crc32c_4_generic(crc, &buf[3 * pic_stride]); + crc = uvg_crc32c_4_generic(crc, &buf[3 * pic_stride] + 4); + return crc ^ 0xFFFFFFFF; +} + int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { bool success = true; + if (bitdepth == 8) { + success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_8bit_generic); + } else { + success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_16bit_generic); + } + success &= uvg_strategyselector_register(opaque, "reg_sad", "generic", 0, &reg_sad_generic); diff --git a/src/strategies/sse42/picture-sse42.c b/src/strategies/sse42/picture-sse42.c new file mode 100644 index 00000000..11f864f0 --- /dev/null +++ b/src/strategies/sse42/picture-sse42.c @@ -0,0 +1,80 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2023, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer.
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "global.h" + +#if COMPILE_INTEL_SSE42 +#include "uvg266.h" + +#include "strategies/sse42/picture-sse42.h" + +#include +#include + +#include "strategyselector.h" + + + +static uint32_t uvg_crc32c_4x4_8bit_sse42(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = _mm_crc32_u32(crc, *((uint32_t *)&buf[0 * pic_stride])); + crc = _mm_crc32_u32(crc, *((uint32_t *)&buf[1 * pic_stride])); + crc = _mm_crc32_u32(crc, *((uint32_t *)&buf[2 * pic_stride])); + crc = _mm_crc32_u32(crc, *((uint32_t *)&buf[3 * pic_stride])); + return crc ^ 0xFFFFFFFF; +} + +static uint32_t uvg_crc32c_4x4_16bit_sse42(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[0 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[1 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[2 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[3 * pic_stride])); + return crc ^ 0xFFFFFFFF; +} + + +#endif //COMPILE_INTEL_SSE42 + +int uvg_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) { + bool success = true; +#if COMPILE_INTEL_SSE42 + if (bitdepth == 8){ + success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_8bit_sse42); + } else { + success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_16bit_sse42); + } +#endif + return success; +} diff --git a/src/strategies/sse42/picture-sse42.h b/src/strategies/sse42/picture-sse42.h new file mode 100644 index 00000000..e1828b8c --- /dev/null +++ b/src/strategies/sse42/picture-sse42.h @@ -0,0 +1,45 @@ +#pragma once + +/***************************************************************************** + * This file is part of uvg266 VVC encoder. 
+ * + * Copyright (c) 2022, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for SSE4.2. 
+ */ + +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" + + +int uvg_strategy_register_picture_sse42(void* opaque, uint8_t bitdepth); diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 8ff49246..2c0d65ae 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -41,6 +41,7 @@ // Define function pointers. +crc32c_4x4_func * uvg_crc32c_4x4; reg_sad_func * uvg_reg_sad = 0; cost_pixel_nxn_func * uvg_sad_4x4 = 0; @@ -83,6 +84,8 @@ pixel_var_func *uvg_pixel_var = 0; generate_residual_func *uvg_generate_residual = 0; + + int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true; @@ -206,3 +209,50 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n) return NULL; } } + +// Precomputed CRC32C (Castagnoli) lookup table: polynomial 0x1EDC6F41, reflected form 0x82F63B78 +const uint32_t uvg_crc_table[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 
0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, +}; \ No newline at end of file diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index eee3ffd5..605a8e2d 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -151,7 +151,14 @@ typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len); typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride); + +extern const uint32_t uvg_crc_table[256]; + +typedef uint32_t(crc32c_4x4_func)(const uvg_pixel *buf, uint32_t pic_stride); + // Declare function pointers. +extern crc32c_4x4_func * uvg_crc32c_4x4; + extern reg_sad_func * uvg_reg_sad; extern cost_pixel_nxn_func * uvg_sad_4x4; From 30321e6dd429429a1d5ac35de7ef7145db4c4c7c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 22 Jun 2023 14:45:05 +0300 Subject: [PATCH 23/36] [ibc] Fix uvg_hashmap_hash definition --- src/hashmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashmap.h b/src/hashmap.h index 45041eb3..3728505b 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -54,7 +54,7 @@ uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value); uvg_hashmap* uvg_hashmap_create(uint32_t bucket_size); -uint32_t uvg_hashmap_hash(uint32_t key); +uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size); void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value); From 4b1f5ca7e2943c77d5494a33514a972e6e587603 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 22 Jun 2023 21:44:49 +0300 Subject: [PATCH 24/36] [ibc] Add the hashmap to the frame and fix some small issues with hashmap and crc32c - crc32c_4x4 strategy was not working, made some changes to the initialization --- src/hashmap.c | 38 ++++++++++++------------ src/hashmap.h | 22 +++++++------- src/strategies/generic/picture-generic.c | 4 +-- 
src/strategies/sse42/picture-sse42.c | 6 ++-- src/strategies/strategies-picture.c | 5 +++- src/strategies/strategies-picture.h | 1 + src/videoframe.c | 7 +++++ src/videoframe.h | 4 +++ 8 files changed, 52 insertions(+), 35 deletions(-) diff --git a/src/hashmap.c b/src/hashmap.c index 3e9c0890..840ab5e8 100644 --- a/src/hashmap.c +++ b/src/hashmap.c @@ -39,8 +39,8 @@ * \param value the value of the node to be created * \return uvg_hashmap_node a node with the given key and value */ -uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value) { - uvg_hashmap_node* new_node = (uvg_hashmap_node*)malloc(sizeof(uvg_hashmap_node)); +uvg_hashmap_node_t* uvg_hashmap_create_node(uint32_t key, uint32_t value) { + uvg_hashmap_node_t* new_node = (uvg_hashmap_node_t*)malloc(sizeof(uvg_hashmap_node_t)); new_node->key = key; new_node->value = value; new_node->next = NULL; @@ -53,11 +53,11 @@ uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value) { * \param bucket_size the size of the hashmap bucket * \return uvg_hashmap a new uvg_hashmap with the given bucket size */ -uvg_hashmap* uvg_hashmap_create(uint32_t bucket_size) +uvg_hashmap_t* uvg_hashmap_create(uint32_t bucket_size) { - uvg_hashmap* new_hashmap = (uvg_hashmap*)malloc(sizeof(uvg_hashmap)); + uvg_hashmap_t* new_hashmap = (uvg_hashmap_t*)malloc(sizeof(uvg_hashmap_t)); new_hashmap->bucket_size = bucket_size; - new_hashmap->table = (uvg_hashmap_node**)malloc(sizeof(uvg_hashmap_node*) * bucket_size); + new_hashmap->table = (uvg_hashmap_node_t**)malloc(sizeof(uvg_hashmap_node_t*) * bucket_size); for (int i = 0; i < bucket_size; i++) { new_hashmap->table[i] = NULL; } @@ -84,10 +84,10 @@ uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size) { * \param key the key of the new node * \param value the value of the new node */ -void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value) { +void uvg_hashmap_insert(uvg_hashmap_t* map, uint32_t key, uint32_t value) { uint32_t hash_index = 
uvg_hashmap_hash(key, map->bucket_size); - uvg_hashmap_node* new_node = uvg_hashmap_create_node(key, value); - new_node->next = map->table[hash_index]; + uvg_hashmap_node_t* new_node = uvg_hashmap_create_node(key, value); + new_node->next = (void*)map->table[hash_index]; map->table[hash_index] = new_node; } @@ -98,20 +98,20 @@ void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value) { * \param key the key to search for * \return uvg_hashmap_node the node with the given key, NULL if not found. */ -uvg_hashmap_node* uvg_hashmap_search(uvg_hashmap* map, uint32_t key) { +uvg_hashmap_node_t* uvg_hashmap_search(uvg_hashmap_t* map, uint32_t key) { uint32_t hashIndex = uvg_hashmap_hash(key, map->bucket_size); - uvg_hashmap_node* temp = map->table[hashIndex]; - uvg_hashmap_node* return_node = NULL; + uvg_hashmap_node_t* temp = map->table[hashIndex]; + uvg_hashmap_node_t* return_node = NULL; // Search key in chain and return all of them while (temp) { if (temp->key == key) { - uvg_hashmap_node* new_node = uvg_hashmap_create_node(key, temp->value); + uvg_hashmap_node_t* new_node = uvg_hashmap_create_node(key, temp->value); if (return_node != NULL) { - new_node->next = return_node; + new_node->next = (void*)return_node; } return_node = new_node; } - temp = temp->next; + temp = (uvg_hashmap_node_t*)temp->next; } return return_node; } @@ -121,11 +121,11 @@ uvg_hashmap_node* uvg_hashmap_search(uvg_hashmap* map, uint32_t key) { * * \param node the node to free the memory of. */ -void uvg_hashmap_node_free(uvg_hashmap_node* node) +void uvg_hashmap_node_free(uvg_hashmap_node_t* node) { while (node) { - uvg_hashmap_node* to_delete = node; - node = node->next; + uvg_hashmap_node_t* to_delete = node; + node = (uvg_hashmap_node_t*)node->next; free(to_delete); } } @@ -135,9 +135,9 @@ void uvg_hashmap_node_free(uvg_hashmap_node* node) * * \param map the hashmap to free the memory of. 
*/ -void uvg_hashmap_free(uvg_hashmap* map) { +void uvg_hashmap_free(uvg_hashmap_t* map) { for (int i = 0; i < map->bucket_size; i++) { - uvg_hashmap_node* temp = map->table[i]; + uvg_hashmap_node_t* temp = map->table[i]; uvg_hashmap_node_free(temp); } free(map->table); diff --git a/src/hashmap.h b/src/hashmap.h index 3728505b..5881f627 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -1,3 +1,5 @@ +#pragma once + /***************************************************************************** * This file is part of uvg266 VVC encoder. * @@ -42,24 +44,24 @@ typedef struct uvg_hashmap_node { uint32_t key; uint32_t value; - struct uvg_hashmap_node* next; -} uvg_hashmap_node; + void* next; +} uvg_hashmap_node_t; typedef struct uvg_hashmap { uint32_t bucket_size; - uvg_hashmap_node** table; -} uvg_hashmap; + uvg_hashmap_node_t** table; +} uvg_hashmap_t; -uvg_hashmap_node* uvg_hashmap_create_node(uint32_t key, uint32_t value); +uvg_hashmap_node_t* uvg_hashmap_create_node(uint32_t key, uint32_t value); -uvg_hashmap* uvg_hashmap_create(uint32_t bucket_size); +uvg_hashmap_t* uvg_hashmap_create(uint32_t bucket_size); uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size); -void uvg_hashmap_insert(uvg_hashmap* map, uint32_t key, uint32_t value); +void uvg_hashmap_insert(uvg_hashmap_t* map, uint32_t key, uint32_t value); -uvg_hashmap_node* uvg_hashmap_search(uvg_hashmap* map, uint32_t key); +uvg_hashmap_node_t* uvg_hashmap_search(uvg_hashmap_t* map, uint32_t key); -void uvg_hashmap_node_free(uvg_hashmap_node* node); +void uvg_hashmap_node_free(uvg_hashmap_node_t* node); -void uvg_hashmap_free(uvg_hashmap* map); +void uvg_hashmap_free(uvg_hashmap_t* map); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index c6b62909..b827c8a9 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -833,9 +833,9 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { 
bool success = true; if (bitdepth == 8) { - success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_8bit_generic); + success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_8bit_generic); } else { - success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_16bit_generic); + success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_16bit_generic); } diff --git a/src/strategies/sse42/picture-sse42.c b/src/strategies/sse42/picture-sse42.c index 11f864f0..bd00d90f 100644 --- a/src/strategies/sse42/picture-sse42.c +++ b/src/strategies/sse42/picture-sse42.c @@ -67,13 +67,13 @@ static uint32_t uvg_crc32c_4x4_16bit_sse42(const uvg_pixel *buf, uint32_t pic_st #endif //COMPILE_INTEL_SSE42 -int uvg_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) { +int uvg_strategy_register_picture_sse42(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_SSE42 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_8bit_sse42); + success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_8bit_sse42); } else { - success &= uvg_strategyselector_register(opaque, "uvg_crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_16bit_sse42); + success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_16bit_sse42); } #endif return success; diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 2c0d65ae..d68f3173 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -41,7 +41,7 @@ // Define function pointers. 
-crc32c_4x4_func * uvg_crc32c_4x4; +crc32c_4x4_func * uvg_crc32c_4x4 = 0; reg_sad_func * uvg_reg_sad = 0; cost_pixel_nxn_func * uvg_sad_4x4 = 0; @@ -97,6 +97,9 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) { if (uvg_g_hardware_flags.intel_flags.sse41) { success &= uvg_strategy_register_picture_sse41(opaque, bitdepth); } + if (uvg_g_hardware_flags.intel_flags.sse42) { + success &= uvg_strategy_register_picture_sse42(opaque, bitdepth); + } if (uvg_g_hardware_flags.intel_flags.avx2) { success &= uvg_strategy_register_picture_avx2(opaque, bitdepth); } diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 605a8e2d..ebb95b4f 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -205,6 +205,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n); cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); #define STRATEGIES_PICTURE_EXPORTS \ + {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \ {"reg_sad", (void**) &uvg_reg_sad}, \ {"sad_4x4", (void**) &uvg_sad_4x4}, \ {"sad_8x8", (void**) &uvg_sad_8x8}, \ diff --git a/src/videoframe.c b/src/videoframe.c index 8b3258ba..eef48d68 100644 --- a/src/videoframe.c +++ b/src/videoframe.c @@ -102,8 +102,15 @@ int uvg_videoframe_free(videoframe_t * const frame) FREE_POINTER(frame->sao_luma); FREE_POINTER(frame->sao_chroma); + if (frame->ibc_hashmap != NULL) { + uvg_hashmap_free(frame->ibc_hashmap); + frame->ibc_hashmap = NULL; + } + free(frame); + + return 1; } diff --git a/src/videoframe.h b/src/videoframe.h index 2e6bb8fb..140affee 100644 --- a/src/videoframe.h +++ b/src/videoframe.h @@ -41,6 +41,7 @@ #include "cu.h" #include "global.h" // IWYU pragma: keep #include "uvg266.h" +#include "hashmap.h" /** @@ -89,6 +90,9 @@ typedef struct videoframe bool source_lmcs_mapped; //!< \brief Indicate if source_lmcs is available and mapped to LMCS bool lmcs_top_level; //!< \brief Indicate that in this level the LMCS 
images are allocated bool rec_lmcs_mapped; //!< \brief Indicate if rec_lmcs is available and mapped to LMCS + + uvg_hashmap_t *ibc_hashmap; //!< \brief Hashmap for IBC hash search + } videoframe_t; From 76d66591c513cfe0e0dff0367edd2dadd19aa513 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 26 Jun 2023 21:24:10 +0300 Subject: [PATCH 25/36] [ibc] Implement CRC for 8x8 block and generate a full hashmap at the frame load --- src/encoderstate.c | 33 ++++++++++++++++++++++++ src/hashmap.c | 23 ++++++++++------- src/hashmap.h | 8 +++--- src/strategies/generic/picture-generic.c | 32 +++++++++++++++++++++++ src/strategies/sse42/picture-sse42.c | 17 +++++++++++- src/strategies/strategies-picture.c | 1 + src/strategies/strategies-picture.h | 3 +++ 7 files changed, 103 insertions(+), 14 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index 48839a84..bfd616f8 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -45,17 +45,20 @@ #include "encode_coding_tree.h" #include "encoder_state-bitstream.h" #include "filter.h" +#include "hashmap.h" #include "image.h" #include "rate_control.h" #include "sao.h" #include "search.h" #include "tables.h" +#include "threads.h" #include "threadqueue.h" #include "alf.h" #include "reshape.h" #include "strategies/strategies-picture.h" + /** * \brief Strength of QP adjustments when using adaptive QP for 360 video. 
* @@ -1936,6 +1939,36 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict assert(0); } + if (state->encoder_control->cfg.ibc != 0) { + int items = 0; + UVG_CLOCK_T hashmap_start_real_time; + UVG_CLOCK_T hashmap_end_real_time; + UVG_GET_TIME(&hashmap_start_real_time); + // Create a new hashmap with UVG_HASHMAP_RATIO buckets per 4x4 block + state->tile->frame->ibc_hashmap = uvg_hashmap_create( + (int)(((float)(state->tile->frame->width * state->tile->frame->height) / + (float)(UVG_HASHMAP_BLOCKSIZE * UVG_HASHMAP_BLOCKSIZE)) * UVG_HASHMAP_RATIO)); + + // Fill the hashmap with the current frame's block information + for (int y = 0; y < state->tile->frame->height; y += 1) { + for (int x = 0; x < state->tile->frame->width; x += 1) { + uint32_t crc = uvg_crc32c_8x8(state->tile->frame->source->y + y * state->tile->frame->width + x, state->tile->frame->width); + + //uint32_t found = uvg_hashmap_search_return_first(state->tile->frame->ibc_hashmap, crc); + //uvg_hashmap_node_t* found = uvg_hashmap_search(state->tile->frame->ibc_hashmap, crc); + + //if (found != NULL) uvg_hashmap_node_free(found); + + uvg_hashmap_insert(state->tile->frame->ibc_hashmap, crc, ((x&0xffff)<<16) | (y&0xffff)); + items++; + } + } + UVG_GET_TIME(&hashmap_end_real_time); + double wall_time = UVG_CLOCK_T_AS_DOUBLE(hashmap_end_real_time) - + UVG_CLOCK_T_AS_DOUBLE(hashmap_start_real_time); + fprintf(stderr, "Hashmap creation time: %f, items: %d, size %d\n", wall_time, items, state->tile->frame->ibc_hashmap->bucket_size); + } + if (state->encoder_control->cfg.lmcs_enable) { uvg_init_lmcs_aps(state->tile->frame->lmcs_aps, state->encoder_control->cfg.width, state->encoder_control->cfg.height, LCU_CU_WIDTH, LCU_CU_WIDTH, state->encoder_control->bitdepth); diff --git a/src/hashmap.c b/src/hashmap.c index 840ab5e8..c8c1c0fb 100644 --- a/src/hashmap.c +++ b/src/hashmap.c @@ -72,9 +72,11 @@ uvg_hashmap_t* uvg_hashmap_create(uint32_t bucket_size) * \param bucket_size the size of 
the hashmap bucket * \return the hashed index for the given key and bucket size. */ -uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size) { +static uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size) +{ key ^= (key >> 20) ^ (key >> 12); return (key ^ (key >> 7) ^ (key >> 4) ^ 2654435769U) % bucket_size; + //return key % bucket_size; } /** @@ -100,20 +102,21 @@ void uvg_hashmap_insert(uvg_hashmap_t* map, uint32_t key, uint32_t value) { */ uvg_hashmap_node_t* uvg_hashmap_search(uvg_hashmap_t* map, uint32_t key) { uint32_t hashIndex = uvg_hashmap_hash(key, map->bucket_size); - uvg_hashmap_node_t* temp = map->table[hashIndex]; - uvg_hashmap_node_t* return_node = NULL; - // Search key in chain and return all of them + return map->table[hashIndex]; +} + +uint32_t uvg_hashmap_search_return_first(uvg_hashmap_t* map, uint32_t key) +{ + uint32_t hashIndex = uvg_hashmap_hash(key, map->bucket_size); + uvg_hashmap_node_t* temp = map->table[hashIndex]; + // Search key in chain and return the first match while (temp) { if (temp->key == key) { - uvg_hashmap_node_t* new_node = uvg_hashmap_create_node(key, temp->value); - if (return_node != NULL) { - new_node->next = (void*)return_node; - } - return_node = new_node; + return temp->value; } temp = (uvg_hashmap_node_t*)temp->next; } - return return_node; + return -1; } /** diff --git a/src/hashmap.h b/src/hashmap.h index 5881f627..cb84c825 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -37,9 +37,9 @@ #include // The ratio of the hashmap bucket size to the maximum number of elements -#define UVG_HASHMAP_RATIO 0.35 +#define UVG_HASHMAP_RATIO 6.0 // Use Hashmap for 4x4 blocks -#define UVG_HASHMAP_BLOCKSIZE 4 +#define UVG_HASHMAP_BLOCKSIZE 8 typedef struct uvg_hashmap_node { uint32_t key; @@ -56,12 +56,14 @@ uvg_hashmap_node_t* uvg_hashmap_create_node(uint32_t key, uint32_t value); uvg_hashmap_t* uvg_hashmap_create(uint32_t bucket_size); -uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size); +//uint32_t 
uvg_hashmap_hash(uint32_t key, uint32_t bucket_size); void uvg_hashmap_insert(uvg_hashmap_t* map, uint32_t key, uint32_t value); uvg_hashmap_node_t* uvg_hashmap_search(uvg_hashmap_t* map, uint32_t key); +uint32_t uvg_hashmap_search_return_first(uvg_hashmap_t* map, uint32_t key); + void uvg_hashmap_node_free(uvg_hashmap_node_t* node); void uvg_hashmap_free(uvg_hashmap_t* map); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index b827c8a9..817befed 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -802,6 +802,20 @@ INLINE static uint32_t uvg_crc32c_4_generic(uint32_t crc, const uvg_pixel *buf) return crc; } + +INLINE static uint32_t uvg_crc32c_8_generic(uint32_t crc, const uvg_pixel *buf) +{ + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[0]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[1]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[2]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[3]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[4]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[5]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[6]) & 0xFF]; + crc = (crc >> 8) ^ uvg_crc_table[(crc ^ buf[7]) & 0xFF]; + return crc; +} + static uint32_t uvg_crc32c_4x4_8bit_generic(const uvg_pixel *buf, uint32_t pic_stride) { uint32_t crc = 0xFFFFFFFF; @@ -829,11 +843,29 @@ static uint32_t uvg_crc32c_4x4_16bit_generic(const uvg_pixel *buf, uint32_t pic_ return crc ^ 0xFFFFFFFF; } +static uint32_t uvg_crc32c_8x8_8bit_generic(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = uvg_crc32c_8_generic(crc, &buf[0 * pic_stride]); + crc = uvg_crc32c_8_generic(crc, &buf[1 * pic_stride]); + + crc = uvg_crc32c_8_generic(crc, &buf[2 * pic_stride]); + crc = uvg_crc32c_8_generic(crc, &buf[3 * pic_stride]); + + crc = uvg_crc32c_8_generic(crc, &buf[4 * pic_stride]); + crc = uvg_crc32c_8_generic(crc, &buf[5 * 
pic_stride]); + + crc = uvg_crc32c_8_generic(crc, &buf[6 * pic_stride]); + crc = uvg_crc32c_8_generic(crc, &buf[7 * pic_stride]); + return crc ^ 0xFFFFFFFF; +} + int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { bool success = true; if (bitdepth == 8) { success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_8bit_generic); + success &= uvg_strategyselector_register(opaque, "crc32c_8x8", "generic", 0, &uvg_crc32c_8x8_8bit_generic); } else { success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "generic", 0, &uvg_crc32c_4x4_16bit_generic); } diff --git a/src/strategies/sse42/picture-sse42.c b/src/strategies/sse42/picture-sse42.c index bd00d90f..0022af91 100644 --- a/src/strategies/sse42/picture-sse42.c +++ b/src/strategies/sse42/picture-sse42.c @@ -64,6 +64,20 @@ static uint32_t uvg_crc32c_4x4_16bit_sse42(const uvg_pixel *buf, uint32_t pic_st return crc ^ 0xFFFFFFFF; } +static uint32_t uvg_crc32c_8x8_8bit_sse42(const uvg_pixel *buf, uint32_t pic_stride) +{ + uint32_t crc = 0xFFFFFFFF; + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[0 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[1 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[2 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[3 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[4 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[5 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[6 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[7 * pic_stride])); + return crc ^ 0xFFFFFFFF; +} + #endif //COMPILE_INTEL_SSE42 @@ -71,7 +85,8 @@ int uvg_strategy_register_picture_sse42(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_SSE42 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_8bit_sse42); + success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "sse42", 0, 
&uvg_crc32c_4x4_8bit_sse42); + success &= uvg_strategyselector_register(opaque, "crc32c_8x8", "sse42", 0, &uvg_crc32c_8x8_8bit_sse42); } else { success &= uvg_strategyselector_register(opaque, "crc32c_4x4", "sse42", 0, &uvg_crc32c_4x4_16bit_sse42); } diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index d68f3173..00ad9ccb 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -42,6 +42,7 @@ // Define function pointers. crc32c_4x4_func * uvg_crc32c_4x4 = 0; +crc32c_8x8_func * uvg_crc32c_8x8 = 0; reg_sad_func * uvg_reg_sad = 0; cost_pixel_nxn_func * uvg_sad_4x4 = 0; diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index ebb95b4f..88f52cfc 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -155,9 +155,11 @@ typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* extern const uint32_t uvg_crc_table[256]; typedef uint32_t(crc32c_4x4_func)(const uvg_pixel *buf, uint32_t pic_stride); +typedef uint32_t(crc32c_8x8_func)(const uvg_pixel *buf, uint32_t pic_stride); // Declare function pointers. 
extern crc32c_4x4_func * uvg_crc32c_4x4; +extern crc32c_8x8_func * uvg_crc32c_8x8; extern reg_sad_func * uvg_reg_sad; @@ -206,6 +208,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); #define STRATEGIES_PICTURE_EXPORTS \ {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \ + {"crc32c_8x8", (void **)&uvg_crc32c_8x8}, \ {"reg_sad", (void**) &uvg_reg_sad}, \ {"sad_4x4", (void**) &uvg_sad_4x4}, \ {"sad_8x8", (void**) &uvg_sad_8x8}, \ From 8cec02280f411168588c48cb578990c8c4de2310 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 28 Jun 2023 23:06:04 +0300 Subject: [PATCH 26/36] [ibc] Use IBC hashmap in LCU row basis --- src/encoder_state-ctors_dtors.c | 5 ++++ src/encoderstate.c | 47 ++++++++++++--------------------- src/hashmap.c | 2 ++ src/hashmap.h | 5 ++-- src/videoframe.c | 5 ---- src/videoframe.h | 5 ++-- 6 files changed, 29 insertions(+), 40 deletions(-) diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c index e2b55ada..965b3d08 100644 --- a/src/encoder_state-ctors_dtors.c +++ b/src/encoder_state-ctors_dtors.c @@ -131,7 +131,10 @@ static int encoder_state_config_tile_init(encoder_state_t * const state, state->tile->frame->ibc_buffer_y = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); state->tile->frame->ibc_buffer_u = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); state->tile->frame->ibc_buffer_v = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); + state->tile->frame->ibc_hashmap_row = malloc(sizeof(uvg_hashmap_t) * state->tile->frame->height_in_lcu); + for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { + state->tile->frame->ibc_hashmap_row[i] = uvg_hashmap_create((LCU_WIDTH * IBC_BUFFER_WIDTH)>>2); state->tile->frame->ibc_buffer_y[i] = (uvg_pixel*)malloc(IBC_BUFFER_SIZE * 3); // ToDo: we don't need this much, but it would also support 4:4:4 state->tile->frame->ibc_buffer_u[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE]; 
state->tile->frame->ibc_buffer_v[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE * 2]; @@ -219,7 +222,9 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) { if (state->encoder_control->cfg.ibc) { for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { FREE_POINTER(state->tile->frame->ibc_buffer_y[i]); + uvg_hashmap_free(state->tile->frame->ibc_hashmap_row[i]); } + FREE_POINTER(state->tile->frame->ibc_hashmap_row); FREE_POINTER(state->tile->frame->ibc_buffer_y); FREE_POINTER(state->tile->frame->ibc_buffer_u); FREE_POINTER(state->tile->frame->ibc_buffer_v); diff --git a/src/encoderstate.c b/src/encoderstate.c index bfd616f8..e5c0c4d8 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -288,6 +288,23 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state, const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x)); const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); + int items = 0; + // Hash the current LCU to the IBC hashmap + for (int32_t xx = (lcu->position_px.x>8)?-6:0; xx < (int32_t)(ibc_block_width)-7; xx+=2) { + for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy++) { + int cur_x = lcu->position_px.x + xx; + int cur_y = lcu->position_px.y + yy; + uint32_t crc = uvg_crc32c_8x8(&frame->rec->y[cur_y * frame->rec->stride + cur_x],frame->rec->stride); + if (state->encoder_control->chroma_format != UVG_CSP_400) { + crc ^= uvg_crc32c_4x4(&frame->rec->u[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); + crc ^= uvg_crc32c_4x4(&frame->rec->v[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); + } + uvg_hashmap_insert(frame->ibc_hashmap_row[ibc_buffer_row], crc, ((cur_x&0xffff)<<16) | (cur_y&0xffff)); + items++; + } + } + //fprintf(stderr, "Inserted %d items to %dx%d at %dx%d\r\n", items, ibc_block_width, ibc_block_height, lcu->position_px.x, 
lcu->position_px.y); + uvg_pixels_blit(&frame->rec->y[lcu->position_px.y * frame->rec->stride + lcu->position_px.x], &frame->ibc_buffer_y[ibc_buffer_row][ibc_buffer_pos_x], ibc_block_width, ibc_block_height, @@ -1939,36 +1956,6 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict assert(0); } - if (state->encoder_control->cfg.ibc != 0) { - int items = 0; - UVG_CLOCK_T hashmap_start_real_time; - UVG_CLOCK_T hashmap_end_real_time; - UVG_GET_TIME(&hashmap_start_real_time); - // Create a new hashmap with UVG_HASHMAP_RATIO buckets per 4x4 block - state->tile->frame->ibc_hashmap = uvg_hashmap_create( - (int)(((float)(state->tile->frame->width * state->tile->frame->height) / - (float)(UVG_HASHMAP_BLOCKSIZE * UVG_HASHMAP_BLOCKSIZE)) * UVG_HASHMAP_RATIO)); - - // Fill the hashmap with the current frame's block information - for (int y = 0; y < state->tile->frame->height; y += 1) { - for (int x = 0; x < state->tile->frame->width; x += 1) { - uint32_t crc = uvg_crc32c_8x8(state->tile->frame->source->y + y * state->tile->frame->width + x, state->tile->frame->width); - - //uint32_t found = uvg_hashmap_search_return_first(state->tile->frame->ibc_hashmap, crc); - //uvg_hashmap_node_t* found = uvg_hashmap_search(state->tile->frame->ibc_hashmap, crc); - - //if (found != NULL) uvg_hashmap_node_free(found); - - uvg_hashmap_insert(state->tile->frame->ibc_hashmap, crc, ((x&0xffff)<<16) | (y&0xffff)); - items++; - } - } - UVG_GET_TIME(&hashmap_end_real_time); - double wall_time = UVG_CLOCK_T_AS_DOUBLE(hashmap_end_real_time) - - UVG_CLOCK_T_AS_DOUBLE(hashmap_start_real_time); - fprintf(stderr, "Hashmap creation time: %f, items: %d, size %d\n", wall_time, items, state->tile->frame->ibc_hashmap->bucket_size); - } - if (state->encoder_control->cfg.lmcs_enable) { uvg_init_lmcs_aps(state->tile->frame->lmcs_aps, state->encoder_control->cfg.width, state->encoder_control->cfg.height, LCU_CU_WIDTH, LCU_CU_WIDTH, state->encoder_control->bitdepth); diff --git 
a/src/hashmap.c b/src/hashmap.c index c8c1c0fb..c9d88d9c 100644 --- a/src/hashmap.c +++ b/src/hashmap.c @@ -44,6 +44,7 @@ uvg_hashmap_node_t* uvg_hashmap_create_node(uint32_t key, uint32_t value) { new_node->key = key; new_node->value = value; new_node->next = NULL; + new_node->size = 1; return new_node; } @@ -90,6 +91,7 @@ void uvg_hashmap_insert(uvg_hashmap_t* map, uint32_t key, uint32_t value) { uint32_t hash_index = uvg_hashmap_hash(key, map->bucket_size); uvg_hashmap_node_t* new_node = uvg_hashmap_create_node(key, value); new_node->next = (void*)map->table[hash_index]; + if (new_node->next != NULL) new_node->size = ((uvg_hashmap_node_t*)new_node->next)->size + 1; map->table[hash_index] = new_node; } diff --git a/src/hashmap.h b/src/hashmap.h index cb84c825..1294bb87 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -37,14 +37,15 @@ #include // The ratio of the hashmap bucket size to the maximum number of elements -#define UVG_HASHMAP_RATIO 6.0 +#define UVG_HASHMAP_RATIO 12.0 // Use Hashmap for 4x4 blocks #define UVG_HASHMAP_BLOCKSIZE 8 typedef struct uvg_hashmap_node { + void* next; uint32_t key; uint32_t value; - void* next; + uint32_t size; } uvg_hashmap_node_t; typedef struct uvg_hashmap { diff --git a/src/videoframe.c b/src/videoframe.c index eef48d68..f5a4d8af 100644 --- a/src/videoframe.c +++ b/src/videoframe.c @@ -102,11 +102,6 @@ int uvg_videoframe_free(videoframe_t * const frame) FREE_POINTER(frame->sao_luma); FREE_POINTER(frame->sao_chroma); - if (frame->ibc_hashmap != NULL) { - uvg_hashmap_free(frame->ibc_hashmap); - frame->ibc_hashmap = NULL; - } - free(frame); diff --git a/src/videoframe.h b/src/videoframe.h index 140affee..7f7e7581 100644 --- a/src/videoframe.h +++ b/src/videoframe.h @@ -81,7 +81,8 @@ typedef struct videoframe uvg_pixel **ibc_buffer_y; //!< \brief Intra Block Copy buffer for each LCU row uvg_pixel **ibc_buffer_u; //!< \brief Intra Block Copy buffer for each LCU row - uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer 
for each LCU row + uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer for each LCU row + uvg_hashmap_t **ibc_hashmap_row; //!< \brief Hashmap for IBC hash search for each LCU row cu_info_t* hmvp_lut_ibc; //!< \brief Look-up table for HMVP in IBC, one for each LCU row uint8_t* hmvp_size_ibc; //!< \brief HMVP IBC LUT size @@ -91,8 +92,6 @@ typedef struct videoframe bool lmcs_top_level; //!< \brief Indicate that in this level the LMCS images are allocated bool rec_lmcs_mapped; //!< \brief Indicate if rec_lmcs is available and mapped to LMCS - uvg_hashmap_t *ibc_hashmap; //!< \brief Hashmap for IBC hash search - } videoframe_t; From 15fb6f8183e84c5049cda4c28cc0319e57470e96 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 29 Jun 2023 21:57:06 +0300 Subject: [PATCH 27/36] [ibc] Add first version of the IBC hash search --- src/search_ibc.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 1 deletion(-) diff --git a/src/search_ibc.c b/src/search_ibc.c index 0457e215..a981f6ca 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -1022,6 +1022,215 @@ static void search_pu_ibc(encoder_state_t * const state, } } +#include "threads.h" + +static int uvg_search_hash_cu_ibc(encoder_state_t* const state, + int x, int y, int depth, + lcu_t* lcu, + double* inter_cost, + double* inter_bitcost) +{ + const int x_cu = x; + const int y_cu = y; + const int part_mode = SIZE_2Nx2N; + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = LCU_WIDTH >> depth; + const int width = PU_GET_W(part_mode, width_cu, 0); + const int height = PU_GET_H(part_mode, width_cu, 0); + + const bool merge_a1 = true; + const bool merge_b1 = true; + + ibc_search_info_t info; + + const int x_local = SUB_SCU(x); + const int y_local = SUB_SCU(y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + + cur_pu->type = CU_IBC; + cur_pu->part_size = part_mode; + 
cur_pu->depth = depth; + cur_pu->tr_depth = depth; + cur_pu->qp = state->qp; + + // Default to candidate 0 + CU_SET_MV_CAND(cur_pu, 0, 0); + + FILL(info, 0); + + info.state = state; + info.pic = frame->source; + info.origin.x = x; + info.origin.y = y; + info.width = width; + info.height = height; + info.mvd_cost_func = + cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info.optimized_sad = uvg_get_optimized_sad(width); + info.lcu = lcu; + + // Search for merge mode candidates + info.num_merge_cand = uvg_inter_get_merge_cand( + state, + x, + y, + width, + height, + merge_a1, + merge_b1, + info.merge_cand, + lcu); + + *inter_cost = MAX_DOUBLE; + + bool valid_mv = false; + + static double time_spent = 0.0; + static double search_time = 0.0; + static double crc_time = 0.0; + static int evaluations = 0; + static int hits = 0; + + + UVG_CLOCK_T hashmap_start_temp; + UVG_CLOCK_T hashmap_end_temp; + + + UVG_CLOCK_T hashmap_start_real_time; + UVG_CLOCK_T hashmap_end_real_time; + UVG_GET_TIME(&hashmap_start_real_time); + + int xx = x; + int yy = y; + + int best_mv_x = INT_MAX>>2; + int best_mv_y = INT_MAX>>2; + + int own_location = ((xx & 0xffff) << 16) | (yy & 0xffff); + + uint32_t ibc_buffer_row = yy / LCU_WIDTH; + + //UVG_GET_TIME(&hashmap_start_temp); + uint32_t crc = uvg_crc32c_8x8(&state->tile->frame->source->y[yy * state->tile->frame->source->stride + xx],state->tile->frame->source->stride); + if (state->encoder_control->chroma_format != UVG_CSP_400) { + crc ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); + crc ^= uvg_crc32c_4x4(&state->tile->frame->source->v[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); + } + /* UVG_GET_TIME(&hashmap_end_temp); + crc_time += UVG_CLOCK_T_AS_DOUBLE(hashmap_end_temp) - + UVG_CLOCK_T_AS_DOUBLE(hashmap_start_temp);*/ + + uvg_hashmap_node_t *result = 
uvg_hashmap_search(state->tile->frame->ibc_hashmap_row[ibc_buffer_row],crc); + + /* UVG_GET_TIME(&hashmap_start_temp); + search_time += UVG_CLOCK_T_AS_DOUBLE(hashmap_start_temp) - + UVG_CLOCK_T_AS_DOUBLE(hashmap_end_temp);*/ + + bool found_block = false; + + int hashes_found = 0; + + while (result != NULL) { + if (hashes_found == 0 && result->size > 1000) { + fprintf(stderr, "Found a block with %d elements\n", result->size); + //break; + } + if (result->key == crc && result->value != own_location) { + hashes_found++; + hits++; + int pos_x = result->value >> 16; + int pos_y = result->value & 0xffff; + int mv_x = pos_x - xx; + int mv_y = pos_y - yy; + if (pos_x <= xx - width && pos_y <= yy - height) { + valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); + if (valid_mv) { + bool full_block = true; // Is the full block covered by the IBC? + for (int xxx = xx+UVG_HASHMAP_BLOCKSIZE; xxx < xx + width; xxx+=UVG_HASHMAP_BLOCKSIZE) { + for (int yyy = yy; yyy < yy + height; yyy += UVG_HASHMAP_BLOCKSIZE) { + uint32_t crc_other_blocks = uvg_crc32c_8x8(&state->tile->frame->source->y[yyy * state->tile->frame->source->stride + xxx],state->tile->frame->source->stride); + if (state->encoder_control->chroma_format != UVG_CSP_400) { + crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); + crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->v[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); + } + uvg_hashmap_node_t *result2 = uvg_hashmap_search(state->tile->frame->ibc_hashmap_row[ibc_buffer_row],crc_other_blocks); + evaluations++; + bool found_match = false; + while (result2) { + if (result2->key == crc_other_blocks) { + int pos_x_temp = (uint16_t)(result2->value >> 16); + int pos_y_temp = (uint16_t)(result2->value & 0xffff); + int mv_x_temp = pos_x_temp - xxx; + int mv_y_temp = pos_y_temp - yyy; 
+ + if (mv_x_temp == mv_x && mv_y_temp == mv_y) { + found_match = true; + break; + } + } + result2 = result2->next; + } + if (!found_match) { + full_block = false; + break; + } + } + if (!full_block) { + break; + } + } + + if (full_block) { + + double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; + bool better_mv = cost < *inter_cost; + if (better_mv) { + best_mv_x = mv_x; + best_mv_y = mv_y; + *inter_cost = cost; + *inter_bitcost = 0.0; + fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); + found_block = true; + break; + } + } + } + } + } + result = result->next; + } + + + UVG_GET_TIME(&hashmap_end_real_time); + time_spent += UVG_CLOCK_T_AS_DOUBLE(hashmap_end_real_time) - + UVG_CLOCK_T_AS_DOUBLE(hashmap_start_real_time); + //if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64) + //fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found); + + if (!found_block) return; + + cur_pu->inter.mv[0][0] = best_mv_x << INTERNAL_MV_PREC; + cur_pu->inter.mv[0][1] = best_mv_y << INTERNAL_MV_PREC; + + + uvg_inter_recon_cu( + state, + lcu, + x, + y, + CU_WIDTH_FROM_DEPTH(depth), + true, + state->encoder_control->chroma_format != UVG_CSP_400); + + if (*inter_cost < MAX_DOUBLE) { + assert(fracmv_within_ibc_range( + &info, + cur_pu->inter.mv[0][0], + cur_pu->inter.mv[0][1])); + } + +} /** @@ -1046,7 +1255,13 @@ void uvg_search_cu_ibc(encoder_state_t * const state, { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; - + // Quick hashmap search + uvg_search_hash_cu_ibc(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + return; // Store information of L0, L1, and bipredictions. // Best cost will be left at MAX_DOUBLE if no valid CU is found. // These will be initialized by the following function. 
From 457d650f49fee50bf33879d361605a8b8c234006 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 19 Jul 2023 09:57:24 +0300 Subject: [PATCH 28/36] [ibc] Fix for CRC calculations - Input for the 64bit crc intrinsic was 32bit --- src/strategies/sse42/picture-sse42.c | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/strategies/sse42/picture-sse42.c b/src/strategies/sse42/picture-sse42.c index 0022af91..30549cb3 100644 --- a/src/strategies/sse42/picture-sse42.c +++ b/src/strategies/sse42/picture-sse42.c @@ -56,26 +56,26 @@ static uint32_t uvg_crc32c_4x4_8bit_sse42(const uvg_pixel *buf, uint32_t pic_str static uint32_t uvg_crc32c_4x4_16bit_sse42(const uvg_pixel *buf, uint32_t pic_stride) { - uint32_t crc = 0xFFFFFFFF; - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[0 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[1 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[2 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[3 * pic_stride])); - return crc ^ 0xFFFFFFFF; + uint64_t crc = 0xFFFFFFFF; + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[0 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[1 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[2 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[3 * pic_stride])); + return (uint32_t)(crc ^ 0xFFFFFFFF); } static uint32_t uvg_crc32c_8x8_8bit_sse42(const uvg_pixel *buf, uint32_t pic_stride) { - uint32_t crc = 0xFFFFFFFF; - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[0 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[1 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[2 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[3 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[4 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[5 * pic_stride])); - crc = _mm_crc32_u64(crc, *((uint32_t *)&buf[6 * pic_stride])); - crc = _mm_crc32_u64(crc, 
*((uint32_t *)&buf[7 * pic_stride])); - return crc ^ 0xFFFFFFFF; + uint64_t crc = 0xFFFFFFFF; + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[0 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[1 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[2 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[3 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[4 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[5 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[6 * pic_stride])); + crc = _mm_crc32_u64(crc, *((uint64_t *)&buf[7 * pic_stride])); + return (uint32_t)(crc ^ 0xFFFFFFFF); } From 8ff184a6b3bec76dbf2be63606c985d4f960c0de Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 21 Jul 2023 20:14:23 +0300 Subject: [PATCH 29/36] [ibc] Fill the IBC hashmap at the start of LCU search and use reverse map for "pos to hash" --- src/encoder_state-ctors_dtors.c | 6 ++++ src/encoderstate.c | 60 +++++++++++++++++++++++---------- src/hashmap.c | 6 ++-- src/search.c | 2 ++ src/search_ibc.c | 48 +++++++++++++++----------- src/videoframe.h | 2 ++ 6 files changed, 84 insertions(+), 40 deletions(-) diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c index 965b3d08..526c3bc5 100644 --- a/src/encoder_state-ctors_dtors.c +++ b/src/encoder_state-ctors_dtors.c @@ -133,6 +133,10 @@ static int encoder_state_config_tile_init(encoder_state_t * const state, state->tile->frame->ibc_buffer_v = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); state->tile->frame->ibc_hashmap_row = malloc(sizeof(uvg_hashmap_t) * state->tile->frame->height_in_lcu); + state->tile->frame->ibc_hashmap_pos_to_hash_stride = ((state->tile->frame->width+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE); + state->tile->frame->ibc_hashmap_pos_to_hash = malloc(sizeof(uint32_t) * + ((state->tile->frame->height+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE) * state->tile->frame->ibc_hashmap_pos_to_hash_stride); + 
for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { state->tile->frame->ibc_hashmap_row[i] = uvg_hashmap_create((LCU_WIDTH * IBC_BUFFER_WIDTH)>>2); state->tile->frame->ibc_buffer_y[i] = (uvg_pixel*)malloc(IBC_BUFFER_SIZE * 3); // ToDo: we don't need this much, but it would also support 4:4:4 @@ -220,6 +224,8 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) { FREE_POINTER(state->tile->frame->hmvp_size_ibc); if (state->encoder_control->cfg.ibc) { + FREE_POINTER(state->tile->frame->ibc_hashmap_pos_to_hash); + for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { FREE_POINTER(state->tile->frame->ibc_buffer_y[i]); uvg_hashmap_free(state->tile->frame->ibc_hashmap_row[i]); diff --git a/src/encoderstate.c b/src/encoderstate.c index e5c0c4d8..dd60fd03 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -288,23 +288,6 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state, const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x)); const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); - int items = 0; - // Hash the current LCU to the IBC hashmap - for (int32_t xx = (lcu->position_px.x>8)?-6:0; xx < (int32_t)(ibc_block_width)-7; xx+=2) { - for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy++) { - int cur_x = lcu->position_px.x + xx; - int cur_y = lcu->position_px.y + yy; - uint32_t crc = uvg_crc32c_8x8(&frame->rec->y[cur_y * frame->rec->stride + cur_x],frame->rec->stride); - if (state->encoder_control->chroma_format != UVG_CSP_400) { - crc ^= uvg_crc32c_4x4(&frame->rec->u[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); - crc ^= uvg_crc32c_4x4(&frame->rec->v[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); - } - uvg_hashmap_insert(frame->ibc_hashmap_row[ibc_buffer_row], crc, ((cur_x&0xffff)<<16) | (cur_y&0xffff)); - items++; - } - } - 
//fprintf(stderr, "Inserted %d items to %dx%d at %dx%d\r\n", items, ibc_block_width, ibc_block_height, lcu->position_px.x, lcu->position_px.y); - uvg_pixels_blit(&frame->rec->y[lcu->position_px.y * frame->rec->stride + lcu->position_px.x], &frame->ibc_buffer_y[ibc_buffer_row][ibc_buffer_pos_x], ibc_block_width, ibc_block_height, @@ -771,6 +754,49 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if(state->frame->slicetype != UVG_SLICE_I) memcpy(original_lut, &state->tile->frame->hmvp_lut[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); if(state->encoder_control->cfg.ibc) memcpy(original_lut_ibc, &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); + + if (state->encoder_control->cfg.ibc) { + videoframe_t * const frame = state->tile->frame; + const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x)); + const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); + int items = 0; + // Hash the current LCU to the IBC hashmap + for (int32_t xx = (lcu->position_px.x>8)?-7:0; xx < (int32_t)(ibc_block_width)-7; xx++) { + for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy++) { + int cur_x = lcu->position_px.x + xx; + int cur_y = lcu->position_px.y + yy; + + // Skip blocks that seem to be the same value for the whole block + uint64_t first_line = + *(uint64_t *)&frame->source->y[cur_y * frame->source->stride + cur_x]; + bool same_data = true; + for (int y_temp = 1; y_temp < 8; y_temp++) { + if (*(uint64_t *)&frame->source->y[(cur_y+y_temp) * frame->source->stride + cur_x] != first_line) { + same_data = false; + break; + } + } + + if (!same_data || (xx % UVG_HASHMAP_BLOCKSIZE == 0 && yy % UVG_HASHMAP_BLOCKSIZE == 0)) { + uint32_t crc = uvg_crc32c_8x8(&frame->source->y[cur_y * frame->source->stride + cur_x],frame->source->stride); + if (xx % UVG_HASHMAP_BLOCKSIZE == 0 && yy % UVG_HASHMAP_BLOCKSIZE == 0) { + 
state->tile->frame->ibc_hashmap_pos_to_hash[(cur_y / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + cur_x / UVG_HASHMAP_BLOCKSIZE] = crc; + } + /* + if (state->encoder_control->chroma_format != UVG_CSP_400) { + crc ^= uvg_crc32c_4x4(&frame->rec->u[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); + crc ^= uvg_crc32c_4x4(&frame->rec->v[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); + } + */ + uvg_hashmap_insert(frame->ibc_hashmap_row[ctu_row], crc, ((cur_x&0xffff)<<16) | (cur_y&0xffff)); + items++; + } + } + } + } + //fprintf(stderr, "Inserted %d items to %dx%d at %dx%d\r\n", items, ibc_block_width, ibc_block_height, lcu->position_px.x, lcu->position_px.y); + + //This part doesn't write to bitstream, it's only search, deblock and sao uvg_search_lcu(state, lcu->position_px.x, lcu->position_px.y, state->tile->hor_buf_search, state->tile->ver_buf_search, lcu->coeff); diff --git a/src/hashmap.c b/src/hashmap.c index c9d88d9c..73d8f891 100644 --- a/src/hashmap.c +++ b/src/hashmap.c @@ -75,9 +75,9 @@ uvg_hashmap_t* uvg_hashmap_create(uint32_t bucket_size) */ static uint32_t uvg_hashmap_hash(uint32_t key, uint32_t bucket_size) { - key ^= (key >> 20) ^ (key >> 12); - return (key ^ (key >> 7) ^ (key >> 4) ^ 2654435769U) % bucket_size; - //return key % bucket_size; + //key ^= (key >> 20) ^ (key >> 12); + //return (key ^ (key >> 7) ^ (key >> 4) ^ 2654435769U) % bucket_size; + return key % bucket_size; } /** diff --git a/src/search.c b/src/search.c index f4f040eb..ec803c1b 100644 --- a/src/search.c +++ b/src/search.c @@ -1012,6 +1012,7 @@ static double search_cu( // Simple IBC search if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc + && cost > 1000 && cu_width > 4 && (x >= cu_width || y >= cu_width) && !cur_cu->skipped) { @@ -1029,6 +1030,7 @@ static double search_cu( cost = mode_cost; inter_bitcost = mode_bitcost; cur_cu->type = CU_IBC; + 
cur_cu->inter.mv_dir = 1; cur_cu->joint_cb_cr = 0; } else { *cur_cu = backup_cu; diff --git a/src/search_ibc.c b/src/search_ibc.c index a981f6ca..c6bef680 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -1082,7 +1082,8 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.merge_cand, lcu); - *inter_cost = MAX_DOUBLE; + *inter_cost = MAX_DOUBLE; + *inter_bitcost = MAX_DOUBLE; bool valid_mv = false; @@ -1112,11 +1113,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, uint32_t ibc_buffer_row = yy / LCU_WIDTH; //UVG_GET_TIME(&hashmap_start_temp); - uint32_t crc = uvg_crc32c_8x8(&state->tile->frame->source->y[yy * state->tile->frame->source->stride + xx],state->tile->frame->source->stride); - if (state->encoder_control->chroma_format != UVG_CSP_400) { + uint32_t crc = state->tile->frame->ibc_hashmap_pos_to_hash[(yy / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + xx / UVG_HASHMAP_BLOCKSIZE]; + //uvg_crc32c_8x8(&state->tile->frame->source->y[yy * state->tile->frame->source->stride + xx],state->tile->frame->source->stride); + /* if (state->encoder_control->chroma_format != UVG_CSP_400) { crc ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); crc ^= uvg_crc32c_4x4(&state->tile->frame->source->v[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); - } + }*/ /* UVG_GET_TIME(&hashmap_end_temp); crc_time += UVG_CLOCK_T_AS_DOUBLE(hashmap_end_temp) - UVG_CLOCK_T_AS_DOUBLE(hashmap_start_temp);*/ @@ -1133,11 +1135,11 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, while (result != NULL) { if (hashes_found == 0 && result->size > 1000) { - fprintf(stderr, "Found a block with %d elements\n", result->size); + //fprintf(stderr, "Found a block with %d elements\n", result->size); //break; } if (result->key == crc && result->value != 
own_location) { - hashes_found++; + hashes_found++; hits++; int pos_x = result->value >> 16; int pos_y = result->value & 0xffff; @@ -1149,11 +1151,13 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, bool full_block = true; // Is the full block covered by the IBC? for (int xxx = xx+UVG_HASHMAP_BLOCKSIZE; xxx < xx + width; xxx+=UVG_HASHMAP_BLOCKSIZE) { for (int yyy = yy; yyy < yy + height; yyy += UVG_HASHMAP_BLOCKSIZE) { - uint32_t crc_other_blocks = uvg_crc32c_8x8(&state->tile->frame->source->y[yyy * state->tile->frame->source->stride + xxx],state->tile->frame->source->stride); + uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[(yyy / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + xxx / UVG_HASHMAP_BLOCKSIZE]; + //uvg_crc32c_8x8(&state->tile->frame->source->y[yyy * state->tile->frame->source->stride + xxx],state->tile->frame->source->stride); + /* if (state->encoder_control->chroma_format != UVG_CSP_400) { crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->v[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); - } + }*/ uvg_hashmap_node_t *result2 = uvg_hashmap_search(state->tile->frame->ibc_hashmap_row[ibc_buffer_row],crc_other_blocks); evaluations++; bool found_match = false; @@ -1180,19 +1184,22 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, break; } } + double cost = *inter_cost, bits = *inter_bitcost; + vector2d_t mv = { best_mv_x, best_mv_y}; + + if (full_block && check_mv_cost(&info, mv_x, mv_y, &cost, &bits, &mv)) { - if (full_block) { - - double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; + //double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; + //cost 
+= bool better_mv = cost < *inter_cost; if (better_mv) { best_mv_x = mv_x; best_mv_y = mv_y; *inter_cost = cost; - *inter_bitcost = 0.0; + *inter_bitcost = bits; fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); found_block = true; - break; + //break; } } } @@ -1256,12 +1263,13 @@ void uvg_search_cu_ibc(encoder_state_t * const state, *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; // Quick hashmap search - uvg_search_hash_cu_ibc(state, - x, y, depth, - lcu, - inter_cost, - inter_bitcost); - return; + /* uvg_search_hash_cu_ibc( + state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + return;*/ // Store information of L0, L1, and bipredictions. // Best cost will be left at MAX_DOUBLE if no valid CU is found. // These will be initialized by the following function. @@ -1327,7 +1335,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400); - if (*inter_cost < MAX_DOUBLE) { + if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } } diff --git a/src/videoframe.h b/src/videoframe.h index 7f7e7581..0a7509c6 100644 --- a/src/videoframe.h +++ b/src/videoframe.h @@ -83,6 +83,8 @@ typedef struct videoframe uvg_pixel **ibc_buffer_u; //!< \brief Intra Block Copy buffer for each LCU row uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer for each LCU row uvg_hashmap_t **ibc_hashmap_row; //!< \brief Hashmap for IBC hash search for each LCU row + uint32_t *ibc_hashmap_pos_to_hash; //!< \brief Hashmap reverse search for position to hash + uint32_t ibc_hashmap_pos_to_hash_stride; //!< \brief Hashmap position to hash stride cu_info_t* hmvp_lut_ibc; //!< \brief Look-up table for HMVP in IBC, one for each LCU row uint8_t* hmvp_size_ibc; //!< \brief HMVP IBC LUT size From 95dc4aa0cb2a386e2b87088040c29d8fe5f45949 Mon Sep 17 00:00:00 2001 
From: Marko Viitanen Date: Fri, 21 Jul 2023 20:15:24 +0300 Subject: [PATCH 30/36] [ibc] Fix the IBC buffer limitation, 256x64 pixels allowed --- src/global.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/global.h b/src/global.h index 773f9c15..65ca2fa9 100644 --- a/src/global.h +++ b/src/global.h @@ -258,7 +258,7 @@ typedef int32_t mv_t; * */ #define IBC_MRG_MAX_NUM_CANDS 6 -#define IBC_BUFFER_SIZE (256*128) +#define IBC_BUFFER_SIZE (128*128) #define IBC_BUFFER_WIDTH (IBC_BUFFER_SIZE / LCU_WIDTH) #define IBC_BUFFER_WIDTH_C ((IBC_BUFFER_SIZE / LCU_WIDTH) >> 1) From 3cef3c0119b8e9593f9fd179a7f51ce15dbabd7c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 21 Jul 2023 20:19:43 +0300 Subject: [PATCH 31/36] Change the hardcoded general_level_idc from 5.2 to 6.3 --- src/encoder_state-bitstream.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index b19ab758..832969fc 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -154,9 +154,8 @@ static void encoder_state_write_bitstream_PTL(bitstream_t *stream, // end Profile Tier //uint8_t level = state->encoder_control->cfg.level; - // ToDo: level hardcoded to 5.2 - WRITE_U(stream, 86, 8, "general_level_idc"); - + // ToDo: level hardcoded to 6.3 + WRITE_U(stream, 105, 8, "general_level_idc"); WRITE_U(stream, 0, 1, "ptl_frame_only_constraint_flag"); WRITE_U(stream, 0, 1, "ptl_multilayer_enabled_flag"); From 6fe629e66608d9230942f0afd30105b83f7fae0b Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 21 Jul 2023 20:40:00 +0300 Subject: [PATCH 32/36] [ibc] A bit of cleanup and skip IBC search if cost is already less than 500 --- src/search.c | 2 +- src/search_ibc.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index ec803c1b..e7b9b737 100644 --- a/src/search.c +++ b/src/search.c @@ -1010,7 +1010,7 @@ static double 
search_cu( } // Simple IBC search - if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I + if (cost > 500 && can_use_intra //&& state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc && cost > 1000 && cu_width > 4 diff --git a/src/search_ibc.c b/src/search_ibc.c index c6bef680..6f8ff41b 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -767,6 +767,7 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->depth = depth; cur_pu->tr_depth = depth; cur_pu->qp = state->qp; + cur_pu->inter.mv_dir = 1; // Default to candidate 0 CU_SET_MV_CAND(cur_pu, 0, 0); @@ -1295,20 +1296,18 @@ void uvg_search_cu_ibc(encoder_state_t * const state, cu_info_t *best_inter_pu = NULL; - // Find best AMVP PU - for (int mv_dir = 1; mv_dir < 4; ++mv_dir) { - int best_key = amvp[mv_dir - 1].keys[0]; + int best_key = amvp[0].keys[0]; - if (amvp[mv_dir - 1].size > 0 && - amvp[mv_dir - 1].cost[best_key] < *inter_cost) { + if (amvp[0].size > 0 && + amvp[0].cost[best_key] < *inter_cost) { - best_inter_pu = &amvp[mv_dir - 1].unit[best_key]; - *inter_cost = amvp[mv_dir - 1].cost[best_key]; - *inter_bitcost = amvp[mv_dir - 1].bits[best_key]; - } + best_inter_pu = &amvp[0].unit[best_key]; + *inter_cost = amvp[0].cost[best_key]; + *inter_bitcost = amvp[0].bits[best_key]; } + // Compare best AMVP against best Merge mode int best_merge_key = merge.keys[0]; From 6f4d538f4fe12990b69f1dbd3f106ea6d83fca2f Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 24 Jul 2023 22:07:22 +0300 Subject: [PATCH 33/36] [ibc] Clean up the ibc search, utilize hash based starting points if ibc=2 --- src/encoder_state-ctors_dtors.c | 12 ++- src/encoderstate.c | 6 +- src/search.c | 2 +- src/search_ibc.c | 149 +++++++++++++++++++++----------- 4 files changed, 111 insertions(+), 58 deletions(-) diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c index 526c3bc5..e951e27c 100644 --- a/src/encoder_state-ctors_dtors.c +++ b/src/encoder_state-ctors_dtors.c @@ 
-133,9 +133,11 @@ static int encoder_state_config_tile_init(encoder_state_t * const state, state->tile->frame->ibc_buffer_v = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu); state->tile->frame->ibc_hashmap_row = malloc(sizeof(uvg_hashmap_t) * state->tile->frame->height_in_lcu); - state->tile->frame->ibc_hashmap_pos_to_hash_stride = ((state->tile->frame->width+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE); - state->tile->frame->ibc_hashmap_pos_to_hash = malloc(sizeof(uint32_t) * - ((state->tile->frame->height+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE) * state->tile->frame->ibc_hashmap_pos_to_hash_stride); + if (state->encoder_control->cfg.ibc & 2) { + state->tile->frame->ibc_hashmap_pos_to_hash_stride = ((state->tile->frame->width+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE); + state->tile->frame->ibc_hashmap_pos_to_hash = malloc(sizeof(uint32_t) * + ((state->tile->frame->height+UVG_HASHMAP_BLOCKSIZE-1)/ UVG_HASHMAP_BLOCKSIZE) * state->tile->frame->ibc_hashmap_pos_to_hash_stride); + } for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { state->tile->frame->ibc_hashmap_row[i] = uvg_hashmap_create((LCU_WIDTH * IBC_BUFFER_WIDTH)>>2); @@ -224,7 +226,9 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) { FREE_POINTER(state->tile->frame->hmvp_size_ibc); if (state->encoder_control->cfg.ibc) { - FREE_POINTER(state->tile->frame->ibc_hashmap_pos_to_hash); + if (state->encoder_control->cfg.ibc & 2) { + FREE_POINTER(state->tile->frame->ibc_hashmap_pos_to_hash); + } for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) { FREE_POINTER(state->tile->frame->ibc_buffer_y[i]); diff --git a/src/encoderstate.c b/src/encoderstate.c index dd60fd03..383f5fa2 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -755,14 +755,14 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if(state->encoder_control->cfg.ibc) memcpy(original_lut_ibc, 
&state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five], sizeof(cu_info_t) * MAX_NUM_HMVP_CANDS); - if (state->encoder_control->cfg.ibc) { + if (state->encoder_control->cfg.ibc & 2) { videoframe_t * const frame = state->tile->frame; const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x)); const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); int items = 0; // Hash the current LCU to the IBC hashmap - for (int32_t xx = (lcu->position_px.x>8)?-7:0; xx < (int32_t)(ibc_block_width)-7; xx++) { - for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy++) { + for (int32_t xx = 0; xx < (int32_t)(ibc_block_width)-7; xx+=UVG_HASHMAP_BLOCKSIZE) { + for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy+=UVG_HASHMAP_BLOCKSIZE) { int cur_x = lcu->position_px.x + xx; int cur_y = lcu->position_px.y + yy; diff --git a/src/search.c b/src/search.c index e7b9b737..ec803c1b 100644 --- a/src/search.c +++ b/src/search.c @@ -1010,7 +1010,7 @@ static double search_cu( } // Simple IBC search - if (cost > 500 && can_use_intra //&& state->frame->slicetype == UVG_SLICE_I + if (can_use_intra //&& state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.ibc && cost > 1000 && cu_width > 4 diff --git a/src/search_ibc.c b/src/search_ibc.c index 6f8ff41b..ba5fffba 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -322,11 +322,60 @@ static void select_starting_point(ibc_search_info_t *info, extra_mv.x >>= INTERNAL_MV_PREC; extra_mv.y >>= INTERNAL_MV_PREC; + int origin_x = info->origin.x; + int origin_y = info->origin.y; + + int ibc_origin_x = origin_x / UVG_HASHMAP_BLOCKSIZE; + int ibc_origin_y = origin_y / UVG_HASHMAP_BLOCKSIZE; + // Check mv_in if it's not one of the merge candidates. 
if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) { check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } + if (info->state->encoder_control->cfg.ibc & 2) { + int own_location = ((origin_x & 0xffff) << 16) | (origin_y & 0xffff); + + uint32_t ibc_buffer_row = origin_y / LCU_WIDTH; + + uint32_t crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(origin_y / UVG_HASHMAP_BLOCKSIZE) * + info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + + origin_x / UVG_HASHMAP_BLOCKSIZE]; + + uvg_hashmap_node_t *result = uvg_hashmap_search( + info->state->tile->frame->ibc_hashmap_row[ibc_buffer_row], crc); + + while (result != NULL) { + if (result->key == crc && result->value != own_location) { + int pos_x = result->value >> 16; + int pos_y = result->value & 0xffff; + int mv_x = pos_x - origin_x; + int mv_y = pos_y - origin_y; + + int ibc_pos_x = pos_x / UVG_HASHMAP_BLOCKSIZE; + int ibc_pos_y = pos_y / UVG_HASHMAP_BLOCKSIZE; + + bool full_block = true; + for (int ibc_x = 0; ibc_x < info->width / UVG_HASHMAP_BLOCKSIZE; ibc_x++) { + for (int ibc_y = 0; ibc_y < info->height / UVG_HASHMAP_BLOCKSIZE; ibc_y++) { + uint32_t neighbor_crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(ibc_pos_y+ibc_y) * info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + ibc_pos_x + ibc_x]; + uint32_t other_crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(ibc_origin_y+ibc_y) * info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + ibc_origin_x + ibc_x]; + if (other_crc != neighbor_crc) { + full_block = false; + break; + } + } + if (!full_block) break; + } + if (full_block) check_mv_cost(info, mv_x, mv_y, best_cost, best_bits, best_mv); + } + result = result->next; + } + } + // Go through candidates for (int32_t i = 0; i < info->num_merge_cand; ++i) { int32_t x = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + (1 << (INTERNAL_MV_PREC - 1)) ) >> INTERNAL_MV_PREC; @@ -896,7 +945,7 @@ static void 
search_pu_ibc(encoder_state_t * const state, cur_pu->skipped = true; merge->size = 1; - merge->cost[0] = 0.0; // TODO: Check this + merge->cost[0] = (merge_idx )* state->lambda_sqrt; // TODO: Check this merge->bits[0] = merge_idx; // TODO: Check this merge->unit[0] = *cur_pu; return; @@ -1010,11 +1059,13 @@ static void search_pu_ibc(encoder_state_t * const state, if(cfg->rdo < 2) { int predmode_ctx; + + const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3; const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); - const double total_bits = no_skip_flag + pred_mode_bits; + const double total_bits = ibc_flag + no_skip_flag + pred_mode_bits; if(amvp[0].size > 0) { const uint8_t best_key = amvp[0].keys[0]; amvp[0].bits[best_key] += total_bits; @@ -1083,8 +1134,8 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.merge_cand, lcu); - *inter_cost = MAX_DOUBLE; - *inter_bitcost = MAX_DOUBLE; + double ibc_cost = MAX_DOUBLE; + double ibc_bitcost = MAX_DOUBLE; bool valid_mv = false; @@ -1113,22 +1164,10 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, uint32_t ibc_buffer_row = yy / LCU_WIDTH; - //UVG_GET_TIME(&hashmap_start_temp); uint32_t crc = state->tile->frame->ibc_hashmap_pos_to_hash[(yy / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + xx / UVG_HASHMAP_BLOCKSIZE]; - //uvg_crc32c_8x8(&state->tile->frame->source->y[yy * state->tile->frame->source->stride + xx],state->tile->frame->source->stride); - /* if (state->encoder_control->chroma_format != UVG_CSP_400) { - crc ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); - crc ^= 
uvg_crc32c_4x4(&state->tile->frame->source->v[(yy >> 1) * (state->tile->frame->source->stride>>1) + (xx >> 1)],state->tile->frame->source->stride>>1); - }*/ - /* UVG_GET_TIME(&hashmap_end_temp); - crc_time += UVG_CLOCK_T_AS_DOUBLE(hashmap_end_temp) - - UVG_CLOCK_T_AS_DOUBLE(hashmap_start_temp);*/ uvg_hashmap_node_t *result = uvg_hashmap_search(state->tile->frame->ibc_hashmap_row[ibc_buffer_row],crc); - - /* UVG_GET_TIME(&hashmap_start_temp); - search_time += UVG_CLOCK_T_AS_DOUBLE(hashmap_start_temp) - - UVG_CLOCK_T_AS_DOUBLE(hashmap_end_temp);*/ + bool found_block = false; @@ -1150,33 +1189,15 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); if (valid_mv) { bool full_block = true; // Is the full block covered by the IBC? - for (int xxx = xx+UVG_HASHMAP_BLOCKSIZE; xxx < xx + width; xxx+=UVG_HASHMAP_BLOCKSIZE) { - for (int yyy = yy; yyy < yy + height; yyy += UVG_HASHMAP_BLOCKSIZE) { - uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[(yyy / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + xxx / UVG_HASHMAP_BLOCKSIZE]; - //uvg_crc32c_8x8(&state->tile->frame->source->y[yyy * state->tile->frame->source->stride + xxx],state->tile->frame->source->stride); - /* - if (state->encoder_control->chroma_format != UVG_CSP_400) { - crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->u[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); - crc_other_blocks ^= uvg_crc32c_4x4(&state->tile->frame->source->v[(yyy >> 1) * (state->tile->frame->source->stride>>1) + (xxx >> 1)],state->tile->frame->source->stride>>1); - }*/ - uvg_hashmap_node_t *result2 = uvg_hashmap_search(state->tile->frame->ibc_hashmap_row[ibc_buffer_row],crc_other_blocks); - evaluations++; - bool found_match = false; - while (result2) { - if (result2->key == crc_other_blocks) { - int pos_x_temp = (uint16_t)(result2->value >> 16); - int 
pos_y_temp = (uint16_t)(result2->value & 0xffff); - int mv_x_temp = pos_x_temp - xxx; - int mv_y_temp = pos_y_temp - yyy; + for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) { + for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) { + uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[ + ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE]; - if (mv_x_temp == mv_x && mv_y_temp == mv_y) { - found_match = true; - break; - } - } - result2 = result2->next; - } - if (!found_match) { + uint32_t crc_neighbor = state->tile->frame->ibc_hashmap_pos_to_hash[((pos_y+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (pos_x+offset_x) / UVG_HASHMAP_BLOCKSIZE]; + + bool found_match = false; + if (crc_neighbor != crc_other_blocks) { full_block = false; break; } @@ -1185,19 +1206,20 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, break; } } - double cost = *inter_cost, bits = *inter_bitcost; - vector2d_t mv = { best_mv_x, best_mv_y}; - - if (full_block && check_mv_cost(&info, mv_x, mv_y, &cost, &bits, &mv)) { + + if (full_block) { + double cost = ibc_cost, bits = ibc_bitcost; + vector2d_t mv = { best_mv_x, best_mv_y}; + cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits); //double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; //cost += - bool better_mv = cost < *inter_cost; + bool better_mv = cost < ibc_cost; if (better_mv) { best_mv_x = mv_x; best_mv_y = mv_y; - *inter_cost = cost; - *inter_bitcost = bits; + ibc_cost = cost; + ibc_bitcost = bits; fprintf(stderr, "Found best IBC!! 
%dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); found_block = true; //break; @@ -1218,10 +1240,37 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, if (!found_block) return; + *inter_cost = 2; + *inter_bitcost = ibc_bitcost; + + uint32_t merge_idx; + int8_t merged = 0; + uint32_t temp_bitcost = 0; + + cur_pu->inter.mv[0][0] = best_mv_x << INTERNAL_MV_PREC; cur_pu->inter.mv[0][1] = best_mv_y << INTERNAL_MV_PREC; + + // Check every candidate to find a match + for(merge_idx = 0; merge_idx < (uint32_t)info.num_merge_cand; merge_idx++) { + if (info.merge_cand[merge_idx].dir == 1 && info.merge_cand[merge_idx].mv[0][0] == cur_pu->inter.mv[0][0] && + info.merge_cand[merge_idx].mv[0][1] == cur_pu->inter.mv[0][1]) { + temp_bitcost += merge_idx; + merged = 1; + fprintf(stderr, "Merged!\r\n"); + break; + } + } + + cur_pu->merged = merged; + cur_pu->merge_idx = merge_idx; + cur_pu->skipped = merged; + const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + ibc_cost += ibc_flag * state->lambda_sqrt; + ibc_bitcost += ibc_flag; + uvg_inter_recon_cu( state, lcu, From 0fefd3f621cb268078b113ea5df095e1aafee387 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 24 Jul 2023 22:55:52 +0300 Subject: [PATCH 34/36] [ibc] Add hash based starting point finder for regular inter search *experimental* --- src/search_inter.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/search_inter.c b/src/search_inter.c index 345a83e9..6508995f 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -312,6 +312,55 @@ static void select_starting_point(inter_search_info_t *info, check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv); } + if (info->state->encoder_control->cfg.ibc & 2) { + int origin_x = info->origin.x; + int origin_y = info->origin.y; + + int ibc_origin_x = origin_x / UVG_HASHMAP_BLOCKSIZE; + int ibc_origin_y = origin_y / UVG_HASHMAP_BLOCKSIZE; + + int
own_location = ((origin_x & 0xffff) << 16) | (origin_y & 0xffff); + + uint32_t ibc_buffer_row = origin_y / LCU_WIDTH; + + uint32_t crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(origin_y / UVG_HASHMAP_BLOCKSIZE) * + info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + + origin_x / UVG_HASHMAP_BLOCKSIZE]; + + uvg_hashmap_node_t *result = uvg_hashmap_search( + info->state->tile->frame->ibc_hashmap_row[ibc_buffer_row], crc); + + while (result != NULL) { + if (result->key == crc && result->value != own_location) { + int pos_x = result->value >> 16; + int pos_y = result->value & 0xffff; + int mv_x = pos_x - origin_x; + int mv_y = pos_y - origin_y; + + int ibc_pos_x = pos_x / UVG_HASHMAP_BLOCKSIZE; + int ibc_pos_y = pos_y / UVG_HASHMAP_BLOCKSIZE; + + bool full_block = true; + for (int ibc_x = 0; ibc_x < info->width / UVG_HASHMAP_BLOCKSIZE; ibc_x++) { + for (int ibc_y = 0; ibc_y < info->height / UVG_HASHMAP_BLOCKSIZE; ibc_y++) { + uint32_t neighbor_crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(ibc_pos_y+ibc_y) * info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + ibc_pos_x + ibc_x]; + uint32_t other_crc = info->state->tile->frame->ibc_hashmap_pos_to_hash + [(ibc_origin_y+ibc_y) * info->state->tile->frame->ibc_hashmap_pos_to_hash_stride + ibc_origin_x + ibc_x]; + if (other_crc != neighbor_crc) { + full_block = false; + break; + } + } + if (!full_block) break; + } + if (full_block) check_mv_cost(info, mv_x, mv_y, best_cost, best_bits, best_mv); + } + result = result->next; + } + } + // Go through candidates for (int32_t i = 0; i < info->num_merge_cand; ++i) { if (info->merge_cand[i].dir == 3) continue; From 20875a9819adade0e4122c6b773b1bfc072fed63 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 27 Jul 2023 10:29:55 +0300 Subject: [PATCH 35/36] [ibc] Calculate hashes every 4 pixels and change the IBC costs a bit --- src/encoderstate.c | 10 ++-------- src/search_ibc.c | 10 +++++----- 2 files changed, 7 insertions(+), 13 
deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index 383f5fa2..089618c9 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -761,8 +761,8 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y)); int items = 0; // Hash the current LCU to the IBC hashmap - for (int32_t xx = 0; xx < (int32_t)(ibc_block_width)-7; xx+=UVG_HASHMAP_BLOCKSIZE) { - for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy+=UVG_HASHMAP_BLOCKSIZE) { + for (int32_t xx = 0; xx < (int32_t)(ibc_block_width)-7; xx+=UVG_HASHMAP_BLOCKSIZE>>1) { + for (int32_t yy = 0; yy < (int32_t)(ibc_block_height)-7; yy+=UVG_HASHMAP_BLOCKSIZE>>1) { int cur_x = lcu->position_px.x + xx; int cur_y = lcu->position_px.y + yy; @@ -782,12 +782,6 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if (xx % UVG_HASHMAP_BLOCKSIZE == 0 && yy % UVG_HASHMAP_BLOCKSIZE == 0) { state->tile->frame->ibc_hashmap_pos_to_hash[(cur_y / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + cur_x / UVG_HASHMAP_BLOCKSIZE] = crc; } - /* - if (state->encoder_control->chroma_format != UVG_CSP_400) { - crc ^= uvg_crc32c_4x4(&frame->rec->u[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); - crc ^= uvg_crc32c_4x4(&frame->rec->v[(cur_y>>1) * (frame->rec->stride>>1) + (cur_x>>1)],frame->rec->stride>>1); - } - */ uvg_hashmap_insert(frame->ibc_hashmap_row[ctu_row], crc, ((cur_x&0xffff)<<16) | (cur_y&0xffff)); items++; } diff --git a/src/search_ibc.c b/src/search_ibc.c index ba5fffba..44f9ac50 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -489,7 +489,7 @@ static double calc_ibc_mvd_cost(const encoder_state_t *state, temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; - return temp_bitcost * state->lambda_sqrt; + return temp_bitcost * state->lambda; } @@ -1008,7 +1008,7 @@ static void search_pu_ibc(encoder_state_t * const state, 
info->width, (best_mv.x >> INTERNAL_MV_PREC), (best_mv.y >> INTERNAL_MV_PREC)); - best_cost += best_bits * info->state->lambda_sqrt; + best_cost += best_bits * info->state->lambda; } @@ -1069,7 +1069,7 @@ static void search_pu_ibc(encoder_state_t * const state, if(amvp[0].size > 0) { const uint8_t best_key = amvp[0].keys[0]; amvp[0].bits[best_key] += total_bits; - amvp[0].cost[best_key] += (total_bits)* state->lambda_sqrt; + amvp[0].cost[best_key] += (total_bits)* state->lambda; } } } @@ -1240,7 +1240,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, if (!found_block) return; - *inter_cost = 2; + *inter_cost = ibc_cost; *inter_bitcost = ibc_bitcost; uint32_t merge_idx; @@ -1268,7 +1268,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); - ibc_cost += ibc_flag * state->lambda_sqrt; + ibc_cost += ibc_flag * state->lambda; ibc_bitcost += ibc_flag; uvg_inter_recon_cu( From 18b4a8be796911431fea2610dd29ff13ec7a740e Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 27 Jul 2023 10:58:20 +0300 Subject: [PATCH 36/36] [ibc] Include the chroma in crc --- src/encoderstate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index 089618c9..cdadccf4 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -778,7 +778,11 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) } if (!same_data || (xx % UVG_HASHMAP_BLOCKSIZE == 0 && yy % UVG_HASHMAP_BLOCKSIZE == 0)) { - uint32_t crc = uvg_crc32c_8x8(&frame->source->y[cur_y * frame->source->stride + cur_x],frame->source->stride); + uint32_t crc = uvg_crc32c_8x8(&frame->source->y[cur_y * frame->source->stride + cur_x],frame->source->stride); + if (state->encoder_control->chroma_format != UVG_CSP_400) { + crc += uvg_crc32c_4x4(&frame->source->u[(cur_y>>1) * (frame->source->stride>>1) + (cur_x>>1)],frame->source->stride>>1); + crc += 
uvg_crc32c_4x4(&frame->source->v[(cur_y>>1) * (frame->source->stride>>1) + (cur_x>>1)],frame->source->stride>>1); + } if (xx % UVG_HASHMAP_BLOCKSIZE == 0 && yy % UVG_HASHMAP_BLOCKSIZE == 0) { state->tile->frame->ibc_hashmap_pos_to_hash[(cur_y / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + cur_x / UVG_HASHMAP_BLOCKSIZE] = crc; }