[jccr] Implement chroma transform search (work in progress)

This commit is contained in:
Joose Sainio 2022-05-25 13:47:02 +03:00
parent 27b730c2e9
commit f056178e80
12 changed files with 461 additions and 315 deletions

View file

@ -146,7 +146,7 @@ typedef struct
uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped
uint8_t merged : 1; //!< \brief flag to indicate this block is merged
uint8_t merge_idx : 3; //!< \brief merge index
uint8_t tr_skip : 1; //!< \brief transform skip flag
uint8_t tr_skip : 3; //!< \brief transform skip flag
uint8_t tr_idx : 3; //!< \brief transform index
uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding

View file

@ -514,7 +514,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
// HEVC only supports transform_skip for Luma
// TODO: transform skip for chroma blocks
CABAC_BIN(cabac, 0, "transform_skip_flag");
CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag");
}
uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, NULL, cur_pu);
}
@ -522,7 +522,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) {
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
CABAC_BIN(cabac, 0, "transform_skip_flag");
CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag");
}
uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, NULL, cur_pu);
}

View file

@ -233,10 +233,10 @@ int uvg_init_rdcost_outfiles(const char *dir_path)
// As long as QP is a two-digit number, template and produced string should
// be equal in length ("%i" -> "22")
assert(RD_SAMPLING_MAX_LAST_QP <= 99);
assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
pthread_mutex_t *curr = outfile_mutex + qp;
@ -290,7 +290,7 @@ out:
*
* \param coeff coefficient array
* \param width coeff block width
* \param type data type (0 == luma)
* \param color data type (0 == luma)
*
* \returns bits needed to code input coefficients
*/
@ -298,7 +298,7 @@ static INLINE double get_coeff_cabac_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip,
cu_info_t* cur_tu)
@ -331,7 +331,7 @@ static INLINE double get_coeff_cabac_cost(
&cabac_copy,
coeff,
width,
type,
color,
scan_mode,
cur_tu,
&bits);
@ -341,7 +341,7 @@ static INLINE double get_coeff_cabac_cost(
&cabac_copy,
coeff,
width,
type,
color,
scan_mode,
&bits);
}
@ -383,7 +383,7 @@ static INLINE void save_accuracy(int qp, double ccc, uint32_t fast_cost)
*
* \param coeff coefficient array
* \param width coeff block width
* \param type data type (0 == luma)
* \param color data type (0 == luma)
*
* \returns number of bits needed to code coefficients
*/
@ -392,7 +392,7 @@ double uvg_get_coeff_cost(
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip)
{
@ -411,13 +411,13 @@ double uvg_get_coeff_cost(
uint64_t weights = uvg_fast_coeff_get_weights(state);
uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
if (check_accuracy) {
double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
save_accuracy(state->qp, ccc, fast_cost);
}
return fast_cost;
}
} else {
double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
if (save_cccs) {
save_ccc(state->qp, coeff, width * width, ccc);
}

View file

@ -64,7 +64,7 @@ double uvg_get_coeff_cost(
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip);

View file

@ -473,8 +473,8 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
const uint8_t tr_depth = tr_cu->tr_depth - depth;
const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U);
const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V);
cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
@ -488,7 +488,8 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
}
if(state->encoder_control->chroma_format != UVG_CSP_400 && !skip_residual_coding && (depth != 4 || (x_px % 8 && y_px % 8))) {
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8));
if( !skip_residual_coding && has_chroma) {
if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
}
@ -522,10 +523,10 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
}
if (cb_flag_y | cb_flag_u | cb_flag_v) {
if (cb_flag_y || cb_flag_u || cb_flag_v) {
// TODO qp_delta_sign_flag
if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && state->encoder_control->cfg.jccr) {
if ((cb_flag_u || cb_flag_v) && has_chroma && state->encoder_control->cfg.jccr) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag");
}
}
@ -547,11 +548,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip);
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1);
}
unsigned chroma_ssd = 0;
if(state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) {
if(has_chroma) {
const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 };
const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
@ -567,21 +568,22 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
chroma_width);
chroma_ssd = ssd_u + ssd_v;
}
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], NULL, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2);
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], NULL, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4);
{
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, chroma_width, 2, scan_order, 0);
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, chroma_width, 2, scan_order, 0);
}
} else {
else {
{
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width);
chroma_width);
int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width);
chroma_ssd = ssd_u_joint + ssd_v_joint;
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0);
}
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, chroma_width, COLOR_U, scan_order, 0);
}
}
if (kvz_is_mts_allowed(state, tr_cu)) {
@ -986,7 +988,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
if (ctrl->cfg.rdo >= 3) {
cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search);
if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4;
if (intra_search.pred_cu.joint_cb_cr == 0) {
intra_search.pred_cu.joint_cb_cr = 4;
cur_cu->tr_skip |= intra_search.pred_cu.tr_skip;
}
else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr;
lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);

View file

@ -388,6 +388,14 @@ static double search_intra_trdepth(
}
pred_cu->intra.mode_chroma = -1;
pred_cu->joint_cb_cr = 4;
for (; trafo < num_transforms; trafo++) {
pred_cu->tr_idx = trafo;
if (trafo == MTS_SKIP) pred_cu->tr_skip |= 1;
else pred_cu->tr_skip &= 6; // Keep chroma tr_skip untouched allthough it probably won't matter here
if (mts_enabled)
{
pred_cu->mts_last_scan_pos = 0;
pred_cu->violates_mts_coeff_constraint = 0;
const int max_tb_size = TR_MAX_WIDTH;
// LFNST search params
@ -1350,6 +1358,7 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
}
#define IS_JCCR_MODE(t) ((t) != DCT7_CHROMA && (t) != CHROMA_TS)
/* Square an int, widening to 64 bits first so the product cannot overflow
 * signed int arithmetic for large residual values. */
static INLINE int64_t square(int x) {
  const int64_t wide = x;
  return wide * wide;
}
@ -1362,6 +1371,181 @@ enum chroma_transforms {
JCCR_3 = 3,
};
/**
 * \brief Evaluate joint Cb-Cr (JCCR) residual coding candidates for one chroma mode.
 *
 * For each JCCR cbf mask (1..3, sign-adjusted by the frame-level jccr_sign),
 * derives the joint residual from the Cb/Cr residuals, accumulates the squared
 * reconstruction error, then picks the best mask (and optionally the
 * second-best, if close enough in distortion), forward-transforms the chosen
 * joint residual(s) into u_coeff and appends the mask to transforms[].
 *
 * \param state        encoder state (provides jccr_sign and transform config)
 * \param chroma_data  per-mode search data; pred_cu of entries is read for
 *                     CU type and passed to the transform
 * \param width        chroma block width (block assumed square)
 * \param mode_i       index of the chroma mode being evaluated
 * \param u_resi       Cb residual, width*width samples
 * \param v_resi       Cr residual, width*width samples
 * \param u_coeff      output coefficient storage, one trans_offset slot per
 *                     candidate transform
 * \param transforms   candidate transform list, appended to in place
 * \param trans_offset stride (in coefficients/samples) between candidates
 * \param num_transforms in/out count of candidates in transforms[]/u_coeff
 */
static void generate_jccr_transforms(encoder_state_t* const state,
intra_search_data_t* chroma_data, int8_t width, int8_t mode_i,
int16_t u_resi[1024], int16_t v_resi[1024], coeff_t u_coeff[5120],
enum chroma_transforms transforms[5], const int trans_offset, int* num_transforms)
{
ALIGNED(64) int16_t temp_resi[LCU_WIDTH_C * LCU_WIDTH_C * 3];
int64_t costs[4];
// costs[0] is the "no JCCR" baseline; INT64_MAX when the jccr==0 pass is
// skipped (non-intra CUs start the loop at jccr == 3).
costs[0] = INT64_MAX;
for (int jccr = chroma_data[mode_i].pred_cu.type == CU_INTRA ? 0 : 3; jccr < 4; jccr++) {
int64_t d1 = 0;
int64_t d2 = 0;
// Negative masks mirror the derivation with inverted Cr sign.
const int cbf_mask = jccr * (state->frame->jccr_sign ? -1 : 1);
int16_t *current_resi = &temp_resi[(jccr - 1) * trans_offset];
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
const int16_t cbx = u_resi[x + y * width], crx = v_resi[x + y * width];
if (cbf_mask == 2)
{
// Cb-dominant joint residual; Cr is reconstructed as resi >> 1.
const int16_t resi = ((4 * cbx + 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == -2)
{
const int16_t resi = ((4 * cbx - 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (-resi >> 1));
}
else if (cbf_mask == 3)
{
// Equal-weight joint residual: Cb = Cr = resi.
const int16_t resi = ((cbx + crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - resi);
}
else if (cbf_mask == -3)
{
const int16_t resi = ((cbx - crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx + resi);
}
else if (cbf_mask == 1)
{
// Cr-dominant joint residual; Cb is reconstructed as resi >> 1.
const int16_t resi = ((4 * crx + 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else if (cbf_mask == -1)
{
const int16_t resi = ((4 * crx - 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (-resi >> 1)) + square(crx - resi);
}
else
{
// cbf_mask == 0: distortion of coding Cb and Cr separately.
d1 += square(cbx);
d2 += square(crx);
}
}
}
costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1;
}
// Select the lowest-distortion mask and track the runner-up.
int64_t min_dist1 = costs[0];
int64_t min_dist2 = INT64_MAX;
int cbf_mask1 = 0;
int cbf_mask2 = 0;
for (int cbfMask = 1; cbfMask < 4; cbfMask++)
{
if (costs[cbfMask] < min_dist1)
{
cbf_mask2 = cbf_mask1; min_dist2 = min_dist1;
cbf_mask1 = cbfMask; min_dist1 = costs[cbf_mask1];
}
else if (costs[cbfMask] < min_dist2)
{
cbf_mask2 = cbfMask; min_dist2 = costs[cbf_mask2];
}
}
if (cbf_mask1)
{
// NOTE(review): kvz_transform2d does not match the uvg_ prefix used by the
// rest of this file — presumably should be uvg_transform2d; verify.
// NOTE(review): indexing chroma_data by cbf_mask1 rather than mode_i looks
// suspicious — confirm the intended pred_cu is passed here.
kvz_transform2d(
state->encoder_control,
&temp_resi[(cbf_mask1 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
COLOR_U,
&chroma_data[cbf_mask1].pred_cu
);
transforms[(*num_transforms)] = cbf_mask1;
(*num_transforms)++;
}
// Keep the runner-up only if it is within 9/8 of the best (or within 3/2 of
// the separate-coding baseline when no best mask was chosen).
if (cbf_mask2 && ((min_dist2 < (9 * min_dist1) / 8) || (!cbf_mask1 && min_dist2 < (3 * min_dist1) / 2)))
{
kvz_transform2d(
state->encoder_control,
&temp_resi[(cbf_mask2 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
COLOR_U,
&chroma_data[cbf_mask2].pred_cu
);
transforms[(*num_transforms)] = cbf_mask2;
(*num_transforms)++;
}
}
/**
 * \brief Quantize the i-th candidate chroma transform's coefficients.
 *
 * Dispatches between RDOQ, transform-skip RDOQ, and plain quantization based
 * on encoder configuration and the candidate's transform type, then scans the
 * quantized coefficients to report whether any are nonzero. For JCCR modes
 * only the joint (u) coefficients exist; the v plane is quantized only for
 * DCT7_CHROMA / CHROMA_TS.
 *
 * \param state        encoder state (cfg.rdoq_enable, cfg.rdoq_skip)
 * \param depth        CU depth, forwarded to RDOQ / cbf_set
 * \param width        block width
 * \param height       block height
 * \param u_coeff      candidate u (or joint) coefficients, slot i
 * \param v_coeff      candidate v coefficients, slot i
 * \param transforms   candidate transform types
 * \param trans_offset stride between candidate slots
 * \param i            index of the candidate to quantize
 * \param u_quant_coeff output quantized u coefficients
 * \param v_quant_coeff output quantized v coefficients
 * \param scan_order   coefficient scan order
 * \param u_has_coeffs out: set to 1 if any quantized u coefficient is nonzero
 * \param v_has_coeffs out: set to 1 if any quantized v coefficient is nonzero
 *                     (only evaluated for non-JCCR candidates)
 */
static void quantize_chroma(
encoder_state_t* const state,
int depth,
int8_t width,
int8_t height,
coeff_t u_coeff[5120],
coeff_t v_coeff[2048],
enum chroma_transforms transforms[5],
const int trans_offset,
int i,
coeff_t u_quant_coeff[1024],
coeff_t v_quant_coeff[1024],
const coeff_scan_order_t scan_order,
bool* u_has_coeffs,
bool* v_has_coeffs)
{
if (state->encoder_control->cfg.rdoq_enable &&
(transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
{
// JCCR_1 is the Cr-dominant joint mode, so its single residual is
// quantized with the Cr (COLOR_V) parameters.
uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, depth, 0);
int j;
// Early scan: the u cbf is needed as RDOQ context for the v plane below.
// NOTE(review): this duplicates the unconditional scan at the end of the
// function — presumably harmless but redundant.
for (j = 0; j < width * height; ++j) {
if (u_quant_coeff[j]) {
*u_has_coeffs = 1;
break;
}
}
if(transforms[i] == DCT7_CHROMA) {
uint16_t temp_cbf = 0;
if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, depth, temp_cbf);
}
}
else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) {
// Transform-skip path uses the dedicated TS RDOQ for both planes.
uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U,scan_order);
uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,scan_order);
}
else {
// Plain (non-RDOQ) quantization.
uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
if(!IS_JCCR_MODE(transforms[i])) {
uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
}
}
// Report whether quantization left any nonzero coefficients.
for (int j = 0; j < width * height; ++j) {
if (u_quant_coeff[j]) {
*u_has_coeffs = 1;
break;
}
}
if (!IS_JCCR_MODE(transforms[i])) {
for (int j = 0; j < width * height; ++j) {
if (v_quant_coeff[j]) {
*v_has_coeffs = 1;
break;
}
}
}
}
int8_t uvg_search_intra_chroma_rdo(
encoder_state_t * const state,
int x_px,
@ -1384,159 +1568,103 @@ int8_t uvg_search_intra_chroma_rdo(
if (reconstruct_chroma) {
uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0);
uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0);
int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
uvg_intra_build_reference(log2_width, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0);
uvg_intra_build_reference(log2_width, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0);
const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
cabac_data_t temp_cabac;
memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t));
int8_t width = MAX(4, LCU_CU_WIDTH >> (depth - 1));
int8_t height = MAX(4, LCU_CU_WIDTH >> (depth - 1));
const cu_loc_t loc = { x_px, y_px, width, height, width, height};
const int offset = (lcu_px.x >> 1) + (lcu_px.y >> 1)* LCU_WIDTH_C;
int8_t width = 1 << log2_width;
int8_t height = 1 << log2_width;
const cu_loc_t loc = { x_px &~7, y_px & ~7, width, height, width, height};
const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C;
for (int8_t i = 0; i < num_modes; ++i) {
const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma;
for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) {
const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma;
double mode_bits = kvz_chroma_mode_bits(state, mode, luma_mode);
chroma_data[mode_i].cost = mode_bits * state->lambda;
if ((state->encoder_control->cfg.jccr ||
(state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width)) &&
chroma_data[i].pred_cu.tr_depth == chroma_data[i].pred_cu.depth) {
chroma_data[mode_i].pred_cu.tr_depth == chroma_data[mode_i].pred_cu.depth) {
ALIGNED(64) kvz_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) kvz_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C];
uvg_intra_predict(
state,
&refs[COLOR_U],
&refs[COLOR_U - 1],
&loc,
COLOR_U,
u_pred,
&chroma_data[i],
&chroma_data[mode_i],
lcu);
uvg_intra_predict(
state,
&refs[COLOR_V],
&refs[COLOR_V - 1],
&loc,
COLOR_V,
v_pred,
&chroma_data[i],
&chroma_data[mode_i],
lcu);
uvg_generate_residual(
&lcu->ref.u[offset],
u_pred,
u_resi,
width,
LCU_WIDTH_C,
width);
uvg_generate_residual(
&lcu->ref.v[offset],
v_pred,
v_resi,
width,
LCU_WIDTH_C,
width);
ALIGNED(64) coeff_t u_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 5];
ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2];
ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
uvg_transform2d(
state->encoder_control, u_resi, u_coeff, width, COLOR_U, &chroma_data[i].pred_cu
state->encoder_control, u_resi, u_coeff, width, COLOR_U, &chroma_data[mode_i].pred_cu
);
uvg_transform2d(
state->encoder_control, v_resi, v_coeff, width, COLOR_V, &chroma_data[i].pred_cu
state->encoder_control, v_resi, v_coeff, width, COLOR_V, &chroma_data[mode_i].pred_cu
);
enum chroma_transforms transforms[5];
transforms[0] = DCT7_CHROMA;
const int trans_offset = width * height;
int num_transforms = 1;
if(state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width) {
const int can_use_tr_skip = state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width;
if(can_use_tr_skip) {
uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width);
uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width);
transforms[num_transforms] = CHROMA_TS;
num_transforms++;
}
if(state->encoder_control->cfg.jccr) {
ALIGNED(64) int16_t temp_resi[LCU_WIDTH_C * LCU_WIDTH_C * 3];
int64_t costs[4];
costs[0] = INT64_MAX;
for (int jccr = chroma_data[i].pred_cu.type == CU_INTRA ? 0 : 3; jccr < 4; jccr++) {
int64_t d1 = 0;
int64_t d2 = 0;
const int cbf_mask = jccr * (state->frame->jccr_sign ? -1 : 1);
int16_t *current_resi = &temp_resi[(jccr - 1) + trans_offset];
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
int cbx = u_resi[x + y * width], crx = v_resi[x + y * width];
if (cbf_mask == 2)
{
const int resi = ((4 * cbx + 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == -2)
{
const int resi = ((4 * cbx - 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == 3)
{
const int resi = ((cbx + crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - resi);
}
else if (cbf_mask == -3)
{
const int resi = ((cbx - crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx + resi);
}
else if (cbf_mask == 1)
{
const int resi = ((4 * crx + 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else if (cbf_mask == -1)
{
const int resi = ((4 * crx - 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else
{
d1 += square(cbx);
d2 += square(crx);
}
}
}
costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1;
}
for(int jccr = chroma_data[i].pred_cu.type == CU_INTRA ? 1 : 3; jccr < 4; jccr++) {
if(costs[jccr] < costs[0]) {
uvg_transform2d(
state->encoder_control,
&temp_resi[(jccr - 1) + trans_offset],
&u_coeff[num_transforms * trans_offset],
generate_jccr_transforms(
state,
chroma_data,
width,
COLOR_U,
&chroma_data[jccr].pred_cu
);
transforms[num_transforms] = jccr;
num_transforms++;
mode_i,
u_resi,
v_resi,
u_coeff,
transforms,
trans_offset,
&num_transforms);
}
}
}
double best_u_cost = MAX_INT64;
double best_v_cost = MAX_INT64;
double best_combined_cost = MAX_INT64;
int best_u_index = -1;
int best_v_index = -1;
int best_combined_index = -1;
for(int trans = 0; trans < num_transforms; trans++) {
for(int i = 0; i < num_transforms; i++) {
coeff_t u_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
@ -1545,56 +1673,24 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_get_scan_order(CU_INTRA, mode, depth);
bool u_has_coeffs = false;
bool v_has_coeffs = false;
if (state->encoder_control->cfg.rdoq_enable &&
(transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
{
uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, depth, 0);
quantize_chroma(
state,
depth,
width,
height,
u_coeff,
v_coeff,
transforms,
trans_offset,
i,
u_quant_coeff,
v_quant_coeff,
scan_order,
&u_has_coeffs,
&v_has_coeffs);
int j;
for (j = 0; i < width * height; ++j) {
if (u_quant_coeff[num_transforms * trans_offset + j]) {
u_has_coeffs = 1;
break;
}
}
if(IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue;
if(transforms[i] == DCT7_CHROMA) {
int16_t temp_cbf = 0;
if (u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, depth, temp_cbf);
}
}
else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) {
uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U,scan_order);
uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,scan_order);
}
else {
uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
if(transforms[i] != CHROMA_TS && transforms[i] != DCT7_CHROMA) {
uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
}
}
for (int j = 0; i < width * height; ++j) {
if (u_quant_coeff[num_transforms * trans_offset + j]) {
u_has_coeffs = 1;
break;
}
}
if (transforms[i] != CHROMA_TS && transforms[i] != DCT7_CHROMA) {
for (int j = 0; i < width * height; ++j) {
if (v_quant_coeff[num_transforms * trans_offset + j]) {
v_has_coeffs = 1;
break;
}
}
}
if(u_has_coeffs) {
uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
CU_INTRA, transforms[i] == CHROMA_TS);
@ -1607,19 +1703,19 @@ int8_t uvg_search_intra_chroma_rdo(
}
if(transforms[i] != JCCR_1) {
for (int j = 0; j < width * height; j++) {
u_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + u_recon_resi[j]);
u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((kvz_pixel)(u_pred[j] + u_recon_resi[j]));
}
}
else {
for (int j = 0; j < width * height; j++) {
u_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + (u_recon_resi[j] >> 1));
u_recon[trans_offset * i + j] = CLIP_TO_PIXEL(u_pred[j] + ((state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]) >> 1));
}
}
}
else {
uvg_pixels_blit(u_pred, &u_recon[offset * i], width, height, width, width);
uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width);
}
if(v_has_coeffs && (transforms[i] == DCT7_CHROMA || transforms[i] == CHROMA_TS)) {
if(v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) {
uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
CU_INTRA, transforms[i] == CHROMA_TS);
if (transforms[i] != CHROMA_TS) {
@ -1630,70 +1726,135 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width);
}
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + v_recon_resi[j]);
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]);
}
}
else if(u_has_coeffs && (transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS)) {
if(transforms[i] != JCCR_2) {
else if(u_has_coeffs && IS_JCCR_MODE(transforms[i])) {
if (transforms[i] == JCCR_1) {
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + u_recon_resi[j]);
}
} else {
}
else if(transforms[i] == JCCR_3) {
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
}
}
else {
for (int j = 0; j < width * height; j++) {
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + ((state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]) >> 1));
}
}
}
else {
uvg_pixels_blit(v_pred, &v_recon[offset * i], width, height, width, width);
uvg_pixels_blit(v_pred, &v_recon[trans_offset * i], width, height, width, width);
}
int ssd_u;
int ssd_v;
unsigned ssd_u = 0;
unsigned ssd_v = 0;
if (!state->encoder_control->cfg.lossless) {
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i],
LCU_WIDTH_C, width,
width);
ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i],
LCU_WIDTH_C, width,
width);
}
double u_bits = 0;
double v_bits = 0;
state->search_cabac.update = 1;
if(state->encoder_control->cfg.jccr) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.joint_cb_cr[transforms[i]],
transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS, u_bits, "jccr_flag"
);
}
int cbf_u = transforms[i] & 2 || (u_has_coeffs && !(transforms[i] & 1));
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_cb[0],
cbf_u, u_bits, "cbf_u"
);
int cbf_v = transforms[i] & 1 || (v_has_coeffs && !(transforms[i] & 2));
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_cr[cbf_u],
transforms[i] & 1 || (v_has_coeffs && !(transforms[i] & 2)), v_bits, "cbf_v"
cbf_v, v_bits, "cbf_v"
);
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
if (state->encoder_control->cfg.jccr && (cbf_u || cbf_v)) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.joint_cb_cr[cbf_u * 2 + cbf_v - 1],
transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS, v_bits, "jccr_flag"
);
}
if (cbf_u || (transforms[i] == JCCR_1 && u_has_coeffs)) {
if(can_use_tr_skip && !IS_JCCR_MODE(transforms[i])) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma,
transforms[i] == CHROMA_TS, u_bits, "tr_skip_u"
);
}
double coeff_cost = kvz_get_coeff_cost(
state,
u_quant_coeff,
NULL,
width,
COLOR_U,
scan_order,
transforms[i] == CHROMA_TS);
u_bits += coeff_cost;
}
if (cbf_v && !IS_JCCR_MODE(transforms[i])) {
if (can_use_tr_skip) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma,
transforms[i] == CHROMA_TS, v_bits, "tr_skip_v"
);
}
v_bits += kvz_get_coeff_cost(
state,
v_quant_coeff,
NULL,
width,
COLOR_V,
scan_order,
transforms[i] == CHROMA_TS);
}
if(!IS_JCCR_MODE(transforms[i])) {
double u_cost = KVZ_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda;
double v_cost = KVZ_CHROMA_MULT * ssd_v + v_bits * state->frame->lambda;
if(u_cost < best_u_cost) {
best_u_cost = u_cost;
best_u_index = transforms[i];
}
if(v_cost < best_v_cost) {
best_v_cost = v_cost;
best_v_index = transforms[i];
}
}
else {
double cost = KVZ_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->frame->lambda;
if (cost < best_combined_cost) {
best_combined_cost = cost;
best_combined_index = transforms[i];
}
}
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
}
if(best_u_cost + best_v_cost < best_combined_cost) {
chroma_data[mode_i].pred_cu.joint_cb_cr = 0;
chroma_data[mode_i].pred_cu.tr_skip |= (best_u_index == CHROMA_TS) << COLOR_U;
chroma_data[mode_i].pred_cu.tr_skip |= (best_v_index == CHROMA_TS) << COLOR_V;
chroma_data[mode_i].cost += best_u_cost + best_v_cost;
}
else {
chroma_data[mode_i].pred_cu.joint_cb_cr = best_combined_index;
chroma_data[mode_i].cost += best_combined_cost;
}
}
else {
state->search_cabac.update = 1;
chroma_data[mode_i].cost = mode_bits * state->lambda;
uvg_intra_recon_cu(state,
x_px, y_px,
depth, &chroma_data[i],
&chroma_data[i].pred_cu,
depth, &chroma_data[mode_i],
&chroma_data[mode_i].pred_cu,
lcu);
double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode);
chroma_data[i].cost = mode_bits * state->lambda;
if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) {
chroma_data[i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu);
chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[mode_i].pred_cu, lcu);
} else {
uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost);
uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[mode_i].pred_cu, lcu, &chroma_data[mode_i].cost);
}
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
}
@ -1708,6 +1869,7 @@ int8_t uvg_search_intra_chroma_rdo(
return 100;
}
#undef IS_JCCR_MODE
int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
const int x_px, const int y_px,
@ -2090,5 +2252,6 @@ void uvg_search_cu_intra(
search_data[0].pred_cu.mts_last_scan_pos = false;
search_data[0].pred_cu.violates_mts_coeff_constraint = false;
}
printf("%f\n", search_data[0].cost);
*mode_out = search_data[0];
}

View file

@ -1723,44 +1723,44 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
return diff;
}
void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int in_stride) {
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
__m128i diff = _mm_setzero_si128();
switch (width) {
case 4:
diff = get_residual_4x1_avx2(ref_in + 0 * in_stride, pred_in + 0 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[0]), diff);
diff = get_residual_4x1_avx2(ref_in + 1 * in_stride, pred_in + 1 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[4]), diff);
diff = get_residual_4x1_avx2(ref_in + 2 * in_stride, pred_in + 2 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[8]), diff);
diff = get_residual_4x1_avx2(ref_in + 3 * in_stride, pred_in + 3 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[12]), diff);
break;
case 8:
diff = get_residual_8x1_avx2(&ref_in[0 * in_stride], &pred_in[0 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[0]), diff);
diff = get_residual_8x1_avx2(&ref_in[1 * in_stride], &pred_in[1 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[8]), diff);
diff = get_residual_8x1_avx2(&ref_in[2 * in_stride], &pred_in[2 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[16]), diff);
diff = get_residual_8x1_avx2(&ref_in[3 * in_stride], &pred_in[3 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[24]), diff);
diff = get_residual_8x1_avx2(&ref_in[4 * in_stride], &pred_in[4 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[32]), diff);
diff = get_residual_8x1_avx2(&ref_in[5 * in_stride], &pred_in[5 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[40]), diff);
diff = get_residual_8x1_avx2(&ref_in[6 * in_stride], &pred_in[6 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[48]), diff);
diff = get_residual_8x1_avx2(&ref_in[7 * in_stride], &pred_in[7 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[56]), diff);
break;
default:
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; x += 16) {
diff = get_residual_8x1_avx2(&ref_in[x + y * in_stride], &pred_in[x + y * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]);
_mm_storeu_si128((__m128i*) & residual[x + y * width], diff);
diff = get_residual_8x1_avx2(&ref_in[(x + 8) + y * in_stride], &pred_in[(x + 8) + y * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[(x + 8) + y * ref_stride], &pred_in[(x + 8) + y * pred_stride]);
_mm_storeu_si128((__m128i*) & residual[(x + 8) + y * width], diff);
}
}

View file

@ -623,7 +623,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
assert(width >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride);
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;

View file

@ -782,12 +782,13 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)
}
/**
 * \brief Compute the prediction residual of a square block.
 *
 * residual = ref - pred, element-wise over a width x width block.
 * Reference and prediction buffers may use different row strides
 * (e.g. when the prediction comes from a compact search buffer while
 * the reference lives in the full frame).
 *
 * \param ref_in       Pointer to the top-left reference pixel.
 * \param pred_in      Pointer to the top-left prediction pixel.
 * \param residual     Output buffer, densely packed (stride == width).
 * \param width        Block width and height in pixels.
 * \param ref_stride   Row stride of ref_in in pixels.
 * \param pred_stride  Row stride of pred_in in pixels.
 */
static void generate_residual_generic(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual,
  int width, int ref_stride, int pred_stride)
{
  for (int y = 0; y < width; ++y) {
    for (int x = 0; x < width; ++x) {
      // Cast is safe: pixel difference fits in int16_t for supported bit depths.
      residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
    }
  }
}

View file

@ -207,8 +207,7 @@ int uvg_quant_cbcr_residual_generic(
) {
ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t u1_residual[2][TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v1_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
{
@ -220,80 +219,64 @@ int uvg_quant_cbcr_residual_generic(
}
}
}
kvz_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride);
kvz_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride);
kvz_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
kvz_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
int best_cbf_mask = -1;
int64_t best_cost = INT64_MAX;
// This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) {
int64_t d1 = 0;
const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1);
const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
int cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
const int16_t cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
if (cbf_mask == 2)
{
u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1));
combined_residual[x + y * width] = (4 * cbx + 2 * crx) / 5;
}
else if (cbf_mask == -2)
{
u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1));
combined_residual[x + y * width] = (4 * cbx - 2 * crx) / 5;
}
else if (cbf_mask == 3)
{
u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]);
combined_residual[x + y * width] = (cbx + crx) / 2;
}
else if (cbf_mask == -3)
{
u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]);
combined_residual[x + y * width] = (cbx - crx) / 2;
}
else if (cbf_mask == 1)
{
v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5);
d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
combined_residual[x + y * width] = (4 * crx + 2 * cbx) / 5;
}
else if (cbf_mask == -1)
{
v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5);
d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
combined_residual[x + y * width] = (4 * crx - 2 * cbx) / 5;
}
else
{
d1 += square(cbx);
//d2 += square(crx);
assert(0);
}
}
}
if (d1 < best_cost) {
best_cbf_mask = i;
best_cost = d1;
}
}
uvg_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip))
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
}
else if (state->encoder_control->cfg.rdoq_enable && false) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
scan_order);
}
else {
uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
}
@ -309,13 +292,12 @@ int uvg_quant_cbcr_residual_generic(
}
if (has_coeffs && !early_skip) {
int y, x;
// Get quantized residual. (coeff_out -> coeff -> residual)
uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
uvg_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
//if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@ -336,39 +318,39 @@ int uvg_quant_cbcr_residual_generic(
// }
// }
//}
const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1);
const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
// Get quantized reconstruction. (residual + pred_in -> rec_out)
for (int y = 0; y < width; y++) {
for (int x = 0; x < width; x++) {
if (temp == 2) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = combined_residual[x + y * width] >> 1;
}
else if (temp == -2) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = -combined_residual[x + y * width] >> 1;
}
else if (temp == 3) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = combined_residual[x + y * width];
}
else if (temp == -3) {
// non-normative clipping to prevent 16-bit overflow
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
v_residual[x + y * width] = -combined_residual[x + y * width];
}
else if (temp == 1) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = v1_residual[x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width] >> 1;
v_residual[x + y * width] = combined_residual[x + y * width];
}
else if (temp == -1) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = -v1_residual[x + y * width];
u_residual[x + y * width] = -combined_residual[x + y * width] >> 1;
v_residual[x + y * width] = combined_residual[x + y * width];
}
}
}
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val);
int16_t v_val = v_residual[x + y * width] + v_pred_in[x + y * in_stride];
@ -379,20 +361,16 @@ int uvg_quant_cbcr_residual_generic(
else/* if (rec_out != pred_in)*/ {
// With no coeffs and rec_out == pred_int we skip copying the coefficients
// because the reconstruction is just the prediction.
int y, x;
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
}
}
}
return has_coeffs ? best_cbf_mask : 0;
return has_coeffs ? cur_cu->joint_cb_cr : 0;
}
/**
@ -431,7 +409,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
const int height = width; // TODO: height for non-square blocks
// Get residual. (ref_in - pred_in -> residual)
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride);
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;

View file

@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);
typedef void (generate_residual_func)(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual, int width, int in_stride);
typedef void (generate_residual_func)(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
// Declare function pointers.
extern reg_sad_func * uvg_reg_sad;
@ -229,6 +229,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
{"ver_sad", (void**) &uvg_ver_sad}, \
{"hor_sad", (void**) &uvg_hor_sad}, \
{"pixel_var", (void**) &uvg_pixel_var}, \
{"generate_residual", (void**) &kvz_generate_residual}, \

View file

@ -650,9 +650,8 @@ static void quantize_tr_residual(encoder_state_t * const state,
}
const bool can_use_trskip = tr_width <= (1 << state->encoder_control->cfg.trskip_max_size) &&
color == COLOR_Y &&
cfg->trskip_enable &&
cur_pu->tr_idx == 1;
cur_pu->tr_skip & (1 << color);
uint8_t has_coeffs;
@ -696,7 +695,6 @@ static void quantize_tr_residual(encoder_state_t * const state,
pred,
coeff,
lmcs_chroma_adj);
cur_pu->tr_skip = tr_skip;
} else {
if(color == COLOR_UV) {
has_coeffs = uvg_quant_cbcr_residual(