From 49f6e72c00603710c5a1f0b68446d644dc153620 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 11 Jan 2023 14:17:18 +0200 Subject: [PATCH] [DepQuant] WIP: doesn't crash but bitstream is illegal and quality a lot worse --- src/dep_quant.c | 103 ++++++++++++++++--------- src/dep_quant.h | 8 +- src/rate_control.c | 34 ++++++-- src/strategies/avx2/quant-avx2.c | 21 ++++- src/strategies/generic/quant-generic.c | 37 ++++++++- 5 files changed, 155 insertions(+), 48 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 54d656e0..70f0e28d 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -69,9 +69,9 @@ typedef struct int m_QShift; int64_t m_QAdd; int64_t m_QScale; - coeff_t m_maxQIdx; - coeff_t m_thresLast; - coeff_t m_thresSSbb; + int64_t m_maxQIdx; + int64_t m_thresLast; + int64_t m_thresSSbb; // distortion normalization int m_DistShift; int64_t m_DistAdd; @@ -135,9 +135,9 @@ typedef struct int8_t m_goRicePar; int8_t m_goRiceZero; int8_t m_stateId; - const uint32_t* m_sigFracBitsArray[2]; - const uint32_t* m_gtxFracBitsArray[6]; - struct common_context* m_commonCtx; + uint32_t *m_sigFracBitsArray[12]; + int32_t *m_gtxFracBitsArray[21]; + common_context* m_commonCtx; unsigned effWidth; unsigned effHeight; @@ -159,12 +159,12 @@ typedef struct int uvg_init_nb_info(encoder_control_t * encoder) { memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); - for (int hd = 0; hd <= 7; hd++) + for (int hd = 0; hd <= 6; hd++) { uint32_t raster2id[64 * 64] = {0}; - for (int vd = 0; vd <= 7; vd++) + for (int vd = 0; vd <= 6; vd++) { if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) { @@ -317,6 +317,21 @@ void uvg_dealloc_nb_info(encoder_control_t* encoder) { } +static INLINE int ceil_log2(uint64_t x) +{ + static const uint64_t t[6] = { 0xFFFFFFFF00000000ull, 0x00000000FFFF0000ull, 0x000000000000FF00ull, 0x00000000000000F0ull, 0x000000000000000Cull, 0x0000000000000002ull }; + int y = (((x & (x - 1)) == 0) ? 0 : 1); + int j = 32; + for (int i = 0; i < 6; i++) + { + int k = (((x & t[i]) == 0) ? 0 : j); + y += k; + x >>= k; + j >>= 1; + } + return y; +} + static void init_quant_block( const encoder_state_t* state, quant_block* qp, @@ -349,8 +364,8 @@ static void init_quant_block( maxLog2TrDynamicRange + 1, 8 * sizeof(int) + invShift - IQUANT_SHIFT - 1); qp->m_maxQIdx = (1 << (qIdxBD - 1)) - 4; - qp->m_thresLast = (coeff_t)(((int64_t)(4) << qp->m_QShift)); - qp->m_thresSSbb = (coeff_t)(((int64_t)(3) << qp->m_QShift)); + qp->m_thresLast = (((int64_t)(4) << (int64_t)qp->m_QShift)); + qp->m_thresSSbb = (((int64_t)(3) << (int64_t)qp->m_QShift)); // distortion calculation parameters const int64_t qScale = (gValue == -1) ? qp->m_QScale : gValue; const int nomDShift = @@ -363,8 +378,7 @@ static void init_quant_block( 1.0 / ((double)((int64_t)(1) << (-nomDShift)) * qScale2 * lambda) : (double)((int64_t)(1) << nomDShift) / (qScale2 * lambda)); const int64_t pow2dfShift = (int64_t)(nomDistFactor * qScale2) + 1; - assert(pow2dfShift > 0xfffffffll); - const int dfShift = uvg_math_ceil_log2(pow2dfShift); + const int dfShift = ceil_log2(pow2dfShift); qp->m_DistShift = 62 + qp->m_QShift - 2 * maxLog2TrDynamicRange - dfShift; qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); @@ -404,8 +418,8 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data numCtx = (color == COLOR_Y? 21 : 11); for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { const cabac_ctx_t * par_ctx = color == COLOR_Y ? &ctx->ctx.cu_parity_flag_model_luma[ctxId] : &ctx->ctx.cu_parity_flag_model_chroma[ctxId]; - const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; - const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; + const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; + const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; int32_t* cb = &rate_estimator->m_gtxFracBits[ctxId]; int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); @@ -420,12 +434,13 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data } - static void xSetLastCoeffOffset( +static void xSetLastCoeffOffset( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, - rate_estimator* rate_estimator, - const bool cb_cbf, + const cu_info_t* const cur_tu, + const int width, + const int height, + rate_estimator* rate_estimator, + const bool cb_cbf, const color_t compID) { int32_t cbfDeltaBits = 0; @@ -438,7 +453,7 @@ static void init_rate_esimator(rate_estimator * rate_estimator, const cabac_data if (useIntraSubPartitions) { bool rootCbfSoFar = false; bool isLastSubPartition = false; //TODO: isp check - uint32_t nTus = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, cur_tu->intra.isp_mode, true); + uint32_t nTus = uvg_get_isp_split_num(width, height, cur_tu->intra.isp_mode, true); if (isLastSubPartition) { //TransformUnit* tuPointer = tu.cu->firstTU; //for (int tuIdx = 0; tuIdx < nTus - 1; tuIdx++) { @@ -477,7 +492,7 @@ static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; for (unsigned xy = 0; xy < 2; xy++) { int32_t bitOffset = (xy ? cbfDeltaBits : 0); int32_t* lastBits = (xy ? rate_estimator->m_lastBitsY : rate_estimator->m_lastBitsX); - const unsigned size = (xy ? (compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height) : (compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width)); + const unsigned size = (xy ? (height) : (width)); const unsigned log2Size = uvg_math_ceil_log2(size); const bool useYCtx = (xy != 0); const cabac_ctx_t* const ctxSetLast = useYCtx ? @@ -504,15 +519,18 @@ static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2], uint32_t gtx_frac_bits[6]) { - state->m_rdCost = INT64_MAX; + state->m_rdCost = INT64_MAX >> 1; state->m_numSigSbb = 0; state->m_remRegBins = 4; // just large enough for last scan pos state->m_refSbbCtxId = -1; state->m_sigFracBits[0] = sig_frac_bits[0]; state->m_sigFracBits[1] = sig_frac_bits[1]; - memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(gtx_frac_bits)); + memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(state->m_coeffFracBits)); state->m_goRicePar = 0; state->m_goRiceZero = 0; + + state->m_sbbFracBits[0] = 0; + state->m_sbbFracBits[1] = 0; } static INLINE void checkRdCostSkipSbbZeroOut(Decision *decision, const depquant_state * const state) @@ -841,7 +859,7 @@ static INLINE void updateStateEOS( state->m_numSigSbb = 1; memset(state->m_absLevelsAndCtxInit, 0, 16 * sizeof(uint8_t)); } - uint8_t* temp = (uint8_t*)(state->m_absLevelsAndCtxInit[scan_pos & 15]); + uint8_t* temp = (uint8_t*)(&state->m_absLevelsAndCtxInit[scan_pos & 15]); *temp = (uint8_t)MIN(255, decision->absLevel); update_common_context(state->m_commonCtx, scan_pos, width_in_sbb, height_in_sbb, sigNSbb, prvState, state); @@ -1099,7 +1117,8 @@ static void xDecideAndUpdate( int uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int width, + const int height, const coeff_t* srcCoeff, coeff_t* coeff_out, const color_t compID, @@ -1115,8 +1134,6 @@ int uvg_dep_quant( dep_quant_context.m_prevStates = &dep_quant_context.m_allStates[4]; dep_quant_context.m_skipStates = &dep_quant_context.m_allStates[8]; - const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; @@ -1173,8 +1190,8 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } - const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; - const coeff_t thres = 4 << q_bits; + const int32_t default_quant_coeff = dep_quant_context.m_quant.m_QScale; + const int32_t thres = dep_quant_context.m_quant.m_thresLast; for (; firstTestPos >= 0; firstTestPos--) { coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { @@ -1188,7 +1205,7 @@ int uvg_dep_quant( //===== real init ===== rate_estimator rate_estimator; init_rate_esimator(&rate_estimator, &state->search_cabac, compID); - xSetLastCoeffOffset(state, cur_tu, cu_loc, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); + xSetLastCoeffOffset(state, cur_tu, width, height, &rate_estimator, cbf_is_set(cur_tu->cbf, COLOR_U), compID); reset_common_context(&dep_quant_context.m_common_context, &rate_estimator, (width * height) >> 4, numCoeff); dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; @@ -1200,10 +1217,27 @@ int uvg_dep_quant( depquant_state_init(&dep_quant_context.m_allStates[k], rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); dep_quant_context.m_allStates[k].effHeight = effectHeight; dep_quant_context.m_allStates[k].effWidth = effectWidth; + dep_quant_context.m_allStates[k].m_commonCtx = &dep_quant_context.m_common_context; + int i1 = (k & 3) ? (k & 3) - 1 : 0; + dep_quant_context.m_allStates[k].m_stateId = i1; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_allStates[k].m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[i1][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + dep_quant_context.m_allStates[k].m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + } } depquant_state_init(&dep_quant_context.m_startState, rate_estimator.m_sigFracBits[0][0], rate_estimator.m_gtxFracBits[0]); dep_quant_context.m_startState.effHeight = effectHeight; dep_quant_context.m_startState.effWidth = effectWidth; + dep_quant_context.m_startState.m_stateId = 0; + dep_quant_context.m_startState.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator.m_sigFracBits[0][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator.m_gtxFracBits[i]; + } const uint32_t height_in_sbb = MAX(height >> 2, 1); @@ -1292,16 +1326,15 @@ int uvg_dep_quant( void uvg_dep_quant_dequant( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int block_type, + const int width, + const int height, const color_t compID, coeff_t* quant_coeff, coeff_t * coeff, bool enableScalingLists) { const encoder_control_t* const encoder = state->encoder_control; - const uint32_t width = compID == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const uint32_t height = compID == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int numCoeff = width * height; @@ -1339,7 +1372,7 @@ void uvg_dep_quant_dequant( int shift = IQUANT_SHIFT + 1 - qpPer - transformShift + (enableScalingLists ? 4 : 0); int invQScale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale ? 1 : 0][qpRem]; int add = (shift < 0) ? 0 : ((1 << shift) >> 1); - int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)(compID); + int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(compID); const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qpDQ % 6]; //----- dequant coefficients ----- diff --git a/src/dep_quant.h b/src/dep_quant.h index c237f81e..c3fb69a4 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -57,8 +57,9 @@ void uvg_dealloc_nb_info(encoder_control_t* encoder); void uvg_dep_quant_dequant( const encoder_state_t* const state, - const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int block_type, + const int width, + const int height, const color_t compID, coeff_t* quant_coeff, coeff_t* coeff, @@ -67,7 +68,8 @@ void uvg_dep_quant_dequant( int uvg_dep_quant( const encoder_state_t* const state, const cu_info_t* const cur_tu, - const cu_loc_t* const cu_loc, + const int width, + const int height, const coeff_t* srcCoeff, coeff_t* coeff_out, const color_t compID, diff --git a/src/rate_control.c b/src/rate_control.c index 67570565..0660f0ac 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -795,6 +795,9 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) state->frame->QP + 2 + frame_allocation, est_qp); } + if(state->encoder_control->cfg.dep_quant) { + est_lambda *= pow(2, 0.25 / 3.0); + } state->lambda = est_lambda; state->lambda_sqrt = sqrt(est_lambda); @@ -820,7 +823,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); ctu->adjust_lambda = state->lambda; @@ -1103,7 +1110,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, pos.x = 0; } state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { @@ -1138,6 +1150,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, state->frame->lambda * 1.5874010519681994, lambda); lambda = clip_lambda(lambda); + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } state->lambda = lambda; state->lambda_sqrt = sqrt(lambda); @@ -1145,8 +1160,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, } else { state->qp = state->frame->QP; - state->lambda = state->frame->lambda; - state->lambda_sqrt = sqrt(state->frame->lambda); + double lambda = state->frame->lambda; + + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = lambda; + state->lambda_sqrt = sqrt(lambda); } lcu->lambda = state->lambda; @@ -1170,7 +1190,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); lcu->adjust_lambda = state->lambda; diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index d49b2f8f..00ef1248 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -707,8 +707,21 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, } // Quantize coeffs. (coeff -> coeff_out) - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if(!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, @@ -792,6 +805,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if (encoder->cfg.dep_quant) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index ed30b691..6a7d8990 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -316,8 +316,21 @@ int uvg_quant_cbcr_residual_generic( if(lfnst_idx) { uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, @@ -497,7 +510,21 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Quantize coeffs. (coeff -> coeff_out) - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { uvg_rdoq(state, coeff, coeff_out, width, height, color, @@ -591,6 +618,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if(encoder->cfg.dep_quant) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];