[mtt] Actually remove the last width dependency to depth

This commit is contained in:
Joose Sainio 2022-09-08 15:10:54 +03:00 committed by Marko Viitanen
parent dcf879e5ed
commit 6a0864839c
22 changed files with 360 additions and 347 deletions

View file

@ -47,12 +47,13 @@
#include "tables.h"
#include "videoframe.h"
bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu)
bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu, const cu_loc_t*
const cu_loc)
{
uint32_t ts_max_size = 1 << state->encoder_control->cfg.trskip_max_size;
const uint32_t max_size = 32; // CU::isIntra(cu) ? MTS_INTRA_MAX_CU_SIZE : MTS_INTER_MAX_CU_SIZE;
const uint32_t cu_width = LCU_WIDTH >> pred_cu->depth;
const uint32_t cu_height = LCU_WIDTH >> pred_cu->depth;
const uint32_t cu_width = cu_loc->width;
const uint32_t cu_height = cu_loc->height;
//bool mts_allowed = cu.chType == CHANNEL_TYPE_LUMA && compID == COMPONENT_Y;
uint8_t mts_type = state->encoder_control->cfg.mts;
@ -66,14 +67,16 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr
return mts_allowed;
}
static void encode_mts_idx(encoder_state_t * const state,
static void encode_mts_idx(
encoder_state_t * const state,
cabac_data_t * const cabac,
const cu_info_t *const pred_cu)
const cu_info_t *const pred_cu,
const cu_loc_t* const cu_loc)
{
//TransformUnit &tu = *cu.firstTU;
int mts_idx = pred_cu->tr_idx;
if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu) && mts_idx != MTS_SKIP
if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu, cu_loc) && mts_idx != MTS_SKIP
&& !pred_cu->violates_mts_coeff_constraint
&& pred_cu->mts_last_scan_pos
)
@ -498,7 +501,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
static void encode_chroma_tu(
encoder_state_t* const state,
const cu_loc_t *cu_loc,
const cu_loc_t * const cu_loc,
int depth,
cu_info_t* cur_pu,
int8_t* scan_idx,
@ -541,8 +544,7 @@ static void encode_chroma_tu(
}
}
else {
// const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
const coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH];
coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH];
uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C);
if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) {
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
@ -700,7 +702,7 @@ static void encode_transform_coeff(
}
*/
int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH);
int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH);
const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0;
const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0;
@ -1290,15 +1292,13 @@ bool uvg_write_split_flag(
const cu_info_t * left_cu,
const cu_info_t * above_cu,
uint8_t split_flag,
const cu_loc_t* const cu_loc,
int depth,
int cu_width,
int x,
int y,
enum uvg_tree_type tree_type,
double* bits_out)
{
uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T));
uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T));
uint16_t abs_x = (cu_loc->x + state->tile->offset_x) >> (tree_type == UVG_CHROMA_T);
uint16_t abs_y = (cu_loc->y + state->tile->offset_y) >> (tree_type == UVG_CHROMA_T);
double bits = 0;
const encoder_control_t* const ctrl = state->encoder_control;
// Implicit split flag when on border
@ -1311,10 +1311,12 @@ bool uvg_write_split_flag(
// ToDo: update this when btt is actually used
bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH
const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
uint8_t implicit_split_mode = UVG_NO_SPLIT;
//bool implicit_split = border;
bool bottom_left_available = ((abs_y + cu_width - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T)));
bool bottom_left_available = ((abs_y + cu_height - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T)));
bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T)));
if (!bottom_left_available && !top_right_available && allow_qt) {
@ -1349,11 +1351,11 @@ bool uvg_write_split_flag(
if (no_split && allow_split) {
// Get left and top block split_flags and if they are present and true, increase model number
// ToDo: should use height and width to increase model, PU_GET_W() ?
if (left_cu && LCU_WIDTH >> left_cu->depth < LCU_WIDTH >> depth) {
if (left_cu && left_cu->depth > depth) {
split_model++;
}
if (above_cu && LCU_WIDTH >> above_cu->depth < LCU_WIDTH >> depth) {
if (above_cu && above_cu->depth > depth) {
split_model++;
}
@ -1457,7 +1459,16 @@ void uvg_encode_coding_tree(
// When not in MAX_DEPTH, insert split flag and split the blocks if needed
if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) {
const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, depth, cu_width, x, y, tree_type,NULL);
const int split_flag = uvg_write_split_flag(
state,
cabac,
left_cu,
above_cu,
(cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7,
cu_loc,
depth,
tree_type,
NULL);
if (split_flag || border) {
const int half_luma = cu_loc->width / 2;
@ -1597,8 +1608,8 @@ void uvg_encode_coding_tree(
uvg_pixel *rec_base_v = &frame->rec->v[x / 2 + y / 2 * ctrl->in.width / 2];
// Luma
for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) {
for (unsigned x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) {
for (unsigned y_px = 0; y_px < cu_height; y_px++) {
for (unsigned x_px = 0; x_px < cu_width; x_px++) {
uvg_bitstream_put(cabac->stream, base_y[x_px + y_px * ctrl->in.width], 8);
rec_base_y[x_px + y_px * ctrl->in.width] = base_y[x_px + y_px * ctrl->in.width];
}
@ -1606,14 +1617,14 @@ void uvg_encode_coding_tree(
// Chroma
if (ctrl->chroma_format != UVG_CSP_400) {
for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) {
for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) {
uvg_bitstream_put(cabac->stream, base_u[x_px + y_px * (ctrl->in.width >> 1)], 8);
rec_base_u[x_px + y_px * (ctrl->in.width >> 1)] = base_u[x_px + y_px * (ctrl->in.width >> 1)];
}
}
for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) {
for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) {
uvg_bitstream_put(cabac->stream, base_v[x_px + y_px * (ctrl->in.width >> 1)], 8);
rec_base_v[x_px + y_px * (ctrl->in.width >> 1)] = base_v[x_px + y_px * (ctrl->in.width >> 1)];
}
@ -1664,7 +1675,7 @@ void uvg_encode_coding_tree(
encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc);
}
encode_mts_idx(state, cabac, cur_cu);
encode_mts_idx(state, cabac, cur_cu, cu_loc);
}
} else if (cur_cu->type == CU_INTRA) {
@ -1701,7 +1712,7 @@ void uvg_encode_coding_tree(
if (tree_type != UVG_CHROMA_T) {
bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y);
}
encode_mts_idx(state, cabac, cur_cu);
encode_mts_idx(state, cabac, cur_cu, cu_loc);
// For 4x4 the chroma PU/TU is coded after the last
if (state->encoder_control->chroma_format != UVG_CSP_400 &&
@ -1731,7 +1742,7 @@ void uvg_encode_coding_tree(
end:
if (is_last_cu_in_qg(state, x, y, depth)) {
if (is_last_cu_in_qg(state, cu_loc)) {
state->last_qp = cur_cu->qp;
}
@ -1752,10 +1763,8 @@ double uvg_mock_encode_coding_unit(
const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T);
int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T);
const int cu_width = LCU_WIDTH >> depth;
int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
const cu_info_t* left_cu = NULL, *above_cu = NULL;
if (x) {
@ -1787,16 +1796,14 @@ double uvg_mock_encode_coding_unit(
left_cu,
above_cu,
0,
cu_loc,
depth,
cu_width >> (tree_type == UVG_CHROMA_T),
x >> (tree_type == UVG_CHROMA_T),
y >> (tree_type == UVG_CHROMA_T),
tree_type,
&bits);
}
// Encode skip flag
if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) {
int8_t ctx_skip = 0;
if (left_cu && left_cu->skipped) {
@ -1829,7 +1836,7 @@ double uvg_mock_encode_coding_unit(
}
}
// Prediction mode
if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) {
int8_t ctx_predmode = 0;

View file

@ -40,7 +40,8 @@
#include "encoderstate.h"
#include "global.h"
bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu);
bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t*
const cu_loc);
bool uvg_is_lfnst_allowed(
const encoder_state_t* const state,
const cu_info_t* const pred_cu,
@ -105,10 +106,8 @@ bool uvg_write_split_flag(
const cu_info_t* left_cu,
const cu_info_t* above_cu,
uint8_t split_flag,
const cu_loc_t* const cu_loc,
int depth,
int cu_width,
int x,
int y,
enum uvg_tree_type tree_type,
double* bits_out);

View file

@ -627,36 +627,45 @@ static void encode_sao(encoder_state_t * const state,
* \param prev_qp -1 if QP delta has not been coded in current QG,
* otherwise the QP of the current QG
*/
static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp)
static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const
int depth)
{
// Stop recursion if the CU is completely outside the frame.
if (x >= state->tile->frame->width || y >= state->tile->frame->height) return;
if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return;
cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
const int cu_width = LCU_WIDTH >> depth;
cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y);
const int width = LCU_WIDTH >> cu->depth;
if (depth <= state->frame->max_qp_delta_depth) {
*prev_qp = -1;
}
if (cu->depth > depth) {
if (cu_loc->width > width) {
// Recursively process sub-CUs.
const int d = cu_width >> 1;
set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp);
const int half_width = cu_loc->width >> 1;
const int half_height = cu_loc->height >> 1;
cu_loc_t split_cu_loc;
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
} else {
bool cbf_found = *prev_qp >= 0;
int y_limit = cu_loc->y + cu_loc->height;
int x_limit = cu_loc->x + cu_loc->width;
if (cu->tr_depth > depth) {
// The CU is split into smaller transform units. Check whether coded
// block flag is set for any of the TUs.
const int tu_width = LCU_WIDTH >> cu->tr_depth;
for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) {
for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) {
for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) {
for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) {
cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
if (cbf_is_set_any(tu->cbf, cu->depth)) {
cbf_found = true;
@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
if (cbf_found) {
*prev_qp = qp = cu->qp;
} else {
qp = uvg_get_cu_ref_qp(state, x, y, *last_qp);
qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp);
}
// Set the correct QP for all state->tile->frame->cu_array elements in
// the area covered by the CU.
for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) {
for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) {
uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
}
}
if (is_last_cu_in_qg(state, x, y, depth)) {
if (is_last_cu_in_qg(state, cu_loc)) {
*last_qp = cu->qp;
}
}
@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
if (state->frame->max_qp_delta_depth >= 0) {
int last_qp = state->last_qp;
int prev_qp = -1;
set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
cu_loc_t cu_loc;
uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH);
set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0);
}
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) {

View file

@ -401,14 +401,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
* \param depth depth in the CU tree
* \return true, if it's the last CU in its QG, otherwise false
*/
static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc)
{
if (state->frame->max_qp_delta_depth < 0) return false;
const int cu_width = LCU_WIDTH >> depth;
const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
const int right = x + cu_width;
const int bottom = y + cu_width;
const int right = cu_loc->x + cu_loc->width;
const int bottom = cu_loc->y + cu_loc->height;
return (right % qg_width == 0 || right >= state->tile->frame->width) &&
(bottom % qg_width == 0 || bottom >= state->tile->frame->height);
}

View file

@ -856,8 +856,7 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
uint8_t max_filter_length_Q = 0;
const int cu_size = LCU_WIDTH >> cu_q->depth;
// TODO: NON square
const int pu_size = dir == EDGE_HOR ? cu_size
: cu_size;
const int pu_size = dir == EDGE_HOR ? cu_size : cu_size;
const int pu_pos = dir == EDGE_HOR ? y_coord
: x_coord;
get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,

View file

@ -273,7 +273,6 @@ typedef int32_t mv_t;
#define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
#define CLIP_TO_QP(value) CLIP(0, 51, (value))
#define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
#define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
#define CEILDIV(x,y) (((x) + (y) - 1) / (y))

View file

@ -1555,7 +1555,7 @@ void uvg_intra_predict(
uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width);
if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) {
predict_cclm(
state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst,
state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst,
(cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1],
tree_type);
}

View file

@ -297,7 +297,7 @@ out:
static INLINE double get_coeff_cabac_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
cu_loc_t *cu_loc,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip,
@ -415,7 +415,7 @@ double uvg_get_coeff_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
cu_info_t* cur_tu,
cu_loc_t *cu_loc,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip,
@ -1409,7 +1409,6 @@ void uvg_rdoq(
int8_t color,
int8_t scan_mode,
int8_t block_type,
int8_t tr_depth,
uint16_t cbf,
uint8_t lfnst_idx)
{

View file

@ -60,7 +60,6 @@ void uvg_rdoq(
int8_t type,
int8_t scan_mode,
int8_t block_type,
int8_t tr_depth,
uint16_t cbf,
uint8_t lfnst_idx);
@ -73,7 +72,7 @@ double uvg_get_coeff_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
cu_info_t* cur_tu,
cu_loc_t *cu_loc,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip,

View file

@ -63,30 +63,39 @@
static const int INTRA_THRESHOLD = 8;
static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type
tree_type)
{
for (int y = y_local; y < y_local + width; y += SCU_WIDTH) {
for (int x = x_local; x < x_local + width; x += SCU_WIDTH) {
const int y_limit = (cu_loc->local_y + cu_loc->height) >> (tree_type == UVG_CHROMA_T);
const int x_limit = (cu_loc->local_x + cu_loc->width) >> (tree_type == UVG_CHROMA_T);
for (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) {
for (int x = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) {
*LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y);
}
}
}
static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, enum uvg_tree_type
static INLINE void copy_cu_pixels(
lcu_t *from,
lcu_t *to,
const cu_loc_t* const cu_loc,
enum uvg_tree_type
tree_type)
{
const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
const int luma_index = x_local + y_local * LCU_WIDTH;
const int chroma_index = tree_type == UVG_CHROMA_T ? x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C;
if(tree_type != UVG_CHROMA_T) {
uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index],
width, width, LCU_WIDTH, LCU_WIDTH);
cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH);
}
if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
uvg_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index],
width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
uvg_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index],
width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
}
}
@ -103,8 +112,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to
if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
//const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T));
const int chroma_x = cu_loc->x >> (tree_type != UVG_CHROMA_T);
const int chroma_y = cu_loc->y >> (tree_type != UVG_CHROMA_T);
const int chroma_x = (cu_loc->x >> 1) & ~3;
const int chroma_y = (cu_loc->y >> 1) & ~3;
const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C);
copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C);
@ -118,15 +127,17 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to
/**
* Copy all non-reference CU data from next level to current level.
*/
static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint, enum
uvg_tree_type tree_type)
static void work_tree_copy_up(
lcu_t *work_tree,
bool joint,
enum
uvg_tree_type tree_type,
const cu_loc_t* const cu_loc,
const int depth)
{
const int width = LCU_WIDTH >> depth;
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_local, y_local, width, width);
copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type);
copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
copy_cu_info (&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type);
copy_cu_pixels(&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type);
copy_cu_coeffs(cu_loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
}
@ -134,24 +145,32 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t
/**
* Copy all non-reference CU data from current level to all lower levels.
*/
static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree, enum uvg_tree_type
tree_type)
static void work_tree_copy_down(
int depth,
lcu_t *work_tree,
enum uvg_tree_type
tree_type,
const cu_loc_t* const cu_loc)
{
const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> 1;
for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) {
copy_cu_info (x_local, y_local, width, &work_tree[depth], &work_tree[i]);
copy_cu_pixels(x_local, y_local, LCU_WIDTH >> depth, &work_tree[depth], &work_tree[i], tree_type);
copy_cu_info (&work_tree[depth], &work_tree[i], cu_loc, tree_type);
copy_cu_pixels(&work_tree[depth], &work_tree[i], cu_loc, tree_type);
}
}
void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
void uvg_lcu_fill_trdepth(
lcu_t *lcu,
const cu_loc_t* const cu_loc,
uint8_t tr_depth,
enum uvg_tree_type
tree_type)
{
const int x_local = SUB_SCU(x_px);
const int y_local = SUB_SCU(y_px);
const unsigned width = (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C) >> depth;
const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
const unsigned height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
for (unsigned y = 0; y < width; y += SCU_WIDTH) {
for (unsigned y = 0; y < height; y += SCU_WIDTH) {
for (unsigned x = 0; x < width; x += SCU_WIDTH) {
LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth;
}
@ -167,6 +186,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in
to->type = cu->type;
to->depth = cu->depth;
to->qp = cu->qp;
to->split_tree = cu->split_tree;
//to->tr_idx = cu->tr_idx;
to->lfnst_idx = cu->lfnst_idx;
to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos;
@ -214,34 +234,37 @@ static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned wid
//Calculates cost for all zero coeffs
static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y,
static double cu_zero_coeff_cost(
const encoder_state_t *state,
lcu_t *work_tree,
const cu_loc_t* const cu_loc,
const int depth)
{
int x_local = SUB_SCU(x);
int y_local = SUB_SCU(y);
int cu_width = LCU_WIDTH >> depth;
lcu_t *const lcu = &work_tree[depth];
const int y_local = cu_loc->local_y;
const int x_local = cu_loc->local_x;
const int luma_index = y_local * LCU_WIDTH + x_local;
const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
double ssd = 0.0;
ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
LCU_WIDTH, LCU_WIDTH, cu_width
LCU_WIDTH, LCU_WIDTH, cu_loc->width
);
if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
);
ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
);
}
// Save the pixels at a lower level of the working tree.
copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1], UVG_BOTH_T);
copy_cu_pixels(lcu, &work_tree[depth + 1], cu_loc, UVG_BOTH_T);
return ssd;
}
@ -295,46 +318,45 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
* Takes into account SSD of reconstruction and the cost of encoding whatever
* prediction unit data needs to be coded.
*/
double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
double uvg_cu_rd_cost_luma(
const encoder_state_t *const state,
const cu_loc_t* const cu_loc,
const cu_info_t *const pred_cu,
lcu_t *const lcu,
uint8_t isp_cbf)
{
const int width = LCU_WIDTH >> depth;
const int height = width; // TODO: height for non-square blocks
const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
// cur_cu is used for TU parameters.
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y);
double coeff_bits = 0;
double tr_tree_bits = 0;
// Check that the coordinates are LCU-local, i.e. within [0, LCU_WIDTH)
assert(x_px >= 0 && x_px < LCU_WIDTH);
assert(y_px >= 0 && y_px < LCU_WIDTH);
const uint8_t tr_depth = tr_cu->tr_depth - depth;
if (tr_depth > 0) {
int offset = width / 2;
if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
double sum = 0;
const int half_width = cu_loc->width >> 1;
const int half_height = cu_loc->height >> 1;
cu_loc_t split_cu_loc;
sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, isp_cbf);
sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, isp_cbf);
sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf);
sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y+ half_height, half_width, half_height);
sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
return sum + tr_tree_bits * state->lambda;
}
// Add transform_tree cbf_luma bit cost.
if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
if (pred_cu->type == CU_INTRA ||
@ -347,7 +369,9 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
}
if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) {
if (is_set && state->encoder_control->cfg.trskip_enable
&& cu_loc->width <= (1 << state->encoder_control->cfg.trskip_max_size)
&& cu_loc->height <= (1 << state->encoder_control->cfg.trskip_max_size)) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag");
}
}
@ -367,28 +391,28 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
// SSD between reconstruction and original
int ssd = 0;
if (!state->encoder_control->cfg.lossless) {
int index = y_px * LCU_WIDTH + x_px;
int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x;
ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH,
width);
cu_loc->width);
}
if (!skip_residual_coding) {
int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
int8_t luma_scan_mode = SCAN_DIAG;
if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
//const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
const coeff_t* coeffs = lcu->coeff.y;
coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
}
else {
int split_type = pred_cu->intra.isp_mode;
int split_limit = uvg_get_isp_split_num(width, height, split_type, true);
int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, split_type, true);
for (int i = 0; i < split_limit; ++i) {
cu_loc_t split_loc;
uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true);
uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height, i, split_type, true);
const int part_x = split_loc.x;
const int part_y = split_loc.y;
@ -406,34 +430,32 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
}
double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
double uvg_cu_rd_cost_chroma(
const encoder_state_t *const state,
cu_info_t *const pred_cu,
lcu_t *const lcu)
lcu_t *const lcu,
const cu_loc_t * const cu_loc)
{
const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
const int height = width; // TODO: height for non-square blocks
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
const vector2d_t lcu_px = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 };
const int width = cu_loc->chroma_width;
const int height = cu_loc->chroma_height;
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
double tr_tree_bits = 0;
double coeff_bits = 0;
assert(x_px >= 0 && x_px < LCU_WIDTH);
assert(y_px >= 0 && y_px < LCU_WIDTH);
if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) {
if (cu_loc->width == 4 && cu_loc->height == 4 && (cu_loc->x % 8 == 0 || cu_loc->y % 8 == 0)) {
// For MAX_PU_DEPTH calculate chroma for previous depth for the first
// block and return 0 cost for all others.
return 0;
}
const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U);
int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V);
// See luma for why the second condition
if (!skip_residual_coding) {
const int tr_depth = depth - pred_cu->depth;
@ -450,14 +472,21 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
}
if (tr_cu->tr_depth > depth) {
int offset = LCU_WIDTH >> (depth + 1);
if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
double sum = 0;
// Recursively process sub-CUs.
const int half_width = cu_loc->width >> 1;
const int half_height = cu_loc->height >> 1;
cu_loc_t split_cu_loc;
sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
sum += uvg_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
return sum + tr_tree_bits * state->lambda;
}
@ -487,14 +516,17 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
if (!skip_residual_coding) {
int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
//const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
// We need the rounded & shifted coordinates for the chroma coeff calculation
cu_loc_t chroma_loc;
uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height);
if((pred_cu->joint_cb_cr & 3) == 0){
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
}
else {
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
}
}
@ -507,31 +539,22 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
static double cu_rd_cost_tr_split_accurate(
const encoder_state_t* const state,
const int x_px,
const int y_px,
const int depth,
const cu_info_t* const pred_cu,
lcu_t* const lcu,
enum uvg_tree_type tree_type,
uint8_t isp_cbf) {
const int width = LCU_WIDTH >> depth;
const int height = width; // TODO: height for non-square blocks
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
uint8_t isp_cbf,
const cu_loc_t* const cu_loc) {
const int width = cu_loc->width;
const int height = cu_loc->height; // TODO: height for non-square blocks
const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
// cur_cu is used for TU parameters.
cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T));
double coeff_bits = 0;
double tr_tree_bits = 0;
// Check that lcu is not in
assert(x_px >= 0 && x_px < LCU_WIDTH);
assert(y_px >= 0 && y_px < LCU_WIDTH);
const uint8_t tr_depth = tr_cu->tr_depth - depth;
const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U);
const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V);
@ -539,7 +562,7 @@ static double cu_rd_cost_tr_split_accurate(
cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
{
int cbf = cbf_is_set_any(pred_cu->cbf, depth);
int cbf = cbf_is_set_any(tr_cu->cbf, depth);
// Only need to signal coded block flag if not skipped or merged
// skip = no coded residual, merge = coded residual
if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) {
@ -548,24 +571,30 @@ static double cu_rd_cost_tr_split_accurate(
}
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8)) && tree_type != UVG_LUMA_T;
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T;
if( !skip_residual_coding && has_chroma) {
if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
if(tr_cu->tr_depth == depth) {
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
}
if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
if(tr_cu->tr_depth == depth) {
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
}
}
if (tr_depth > 0) {
int offset = LCU_WIDTH >> (depth + 1);
if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
double sum = 0;
sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
const int half_width = cu_loc->width >> 1;
const int half_height = cu_loc->height >> 1;
cu_loc_t split_cu_loc;
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
return sum + tr_tree_bits * state->lambda;
}
const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T;
@ -573,7 +602,7 @@ static double cu_rd_cost_tr_split_accurate(
const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP);
// Add transform_tree cbf_luma bit cost.
if (!is_isp) {
const int is_tr_split = depth - tr_cu->depth;
const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH;
if ((pred_cu->type == CU_INTRA ||
is_tr_split ||
cb_flag_u ||
@ -610,7 +639,7 @@ static double cu_rd_cost_tr_split_accurate(
// SSD between reconstruction and original
unsigned luma_ssd = 0;
if (!state->encoder_control->cfg.lossless && tree_type != UVG_CHROMA_T) {
int index = y_px * LCU_WIDTH + x_px;
int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y;
luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH,
width);
@ -623,12 +652,12 @@ static double cu_rd_cost_tr_split_accurate(
if (can_use_tr_skip) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag");
}
int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
int8_t luma_scan_mode = SCAN_DIAG;
if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
//const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
const coeff_t* coeffs = lcu->coeff.y;
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
}
else {
int split_type = pred_cu->intra.isp_mode;
@ -636,7 +665,7 @@ static double cu_rd_cost_tr_split_accurate(
for (int i = 0; i < split_limit; ++i) {
cu_loc_t split_loc;
uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true);
uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true);
const int part_x = split_loc.x;
const int part_y = split_loc.y;
@ -649,8 +678,8 @@ static double cu_rd_cost_tr_split_accurate(
}
}
if(depth == 4 || tree_type == UVG_LUMA_T) {
if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, COLOR_Y, lcu)) {
if(cu_loc->width == 4 || tree_type == UVG_LUMA_T) {
if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, COLOR_Y, lcu)) {
const int lfnst_idx = tr_cu->lfnst_idx;
CABAC_FBITS_UPDATE(
cabac,
@ -672,14 +701,17 @@ static double cu_rd_cost_tr_split_accurate(
unsigned chroma_ssd = 0;
if(has_chroma) {
const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 };
uvg_cu_loc_ctor(&loc, lcu_px.x, lcu_px.y, width, height);
const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
const int chroma_height = chroma_width; // TODO: height for non-square blocks
int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
cu_loc_t chroma_loc;
const vector2d_t lcu_px = { (cu_loc->local_x >> 1) & ~3, (cu_loc->local_y >> 1) &~3 };
uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, width, height);
const int chroma_width = cu_loc->chroma_width;
const int chroma_height = cu_loc->chroma_height; // TODO: height for non-square blocks
int8_t scan_order = SCAN_DIAG;
//const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size);
const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable
&& chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size)
&& chroma_height <= (1 << state->encoder_control->cfg.trskip_max_size);
if(pred_cu->joint_cb_cr == 0) {
if (!state->encoder_control->cfg.lossless) {
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
@ -697,8 +729,8 @@ static double cu_rd_cost_tr_split_accurate(
if(chroma_can_use_tr_skip && cb_flag_v) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag");
}
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU);
}
else {
@ -715,12 +747,12 @@ static double cu_rd_cost_tr_split_accurate(
if (chroma_can_use_tr_skip) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag");
}
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU);
coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU);
}
}
if (uvg_is_lfnst_allowed(state, tr_cu, width, height, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) {
const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx;
if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, cu_loc->width == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) {
const int lfnst_idx = (cu_loc->width != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx;
CABAC_FBITS_UPDATE(
cabac,
&cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T],
@ -739,7 +771,7 @@ static double cu_rd_cost_tr_split_accurate(
tr_cu->lfnst_last_scan_pos = false;
tr_cu->violates_lfnst_constrained_luma = false;
tr_cu->violates_lfnst_constrained_chroma = false;
if (uvg_is_mts_allowed(state, tr_cu) && tree_type != UVG_CHROMA_T) {
if (uvg_is_mts_allowed(state, tr_cu, cu_loc) && tree_type != UVG_CHROMA_T) {
bool symbol = tr_cu->tr_idx != 0;
int ctx_idx = 0;
@ -1035,10 +1067,6 @@ static double search_cu(
if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
intra_search.pred_cu.joint_cb_cr = 0;
// There is almost no benefit to doing the chroma mode search for
// rd2. Possibly because the luma mode search already takes chroma
// into account, so there is less of a chance of luma mode being
// really bad for chroma.
if(tree_type == UVG_CHROMA_T) {
intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra;
intra_mode = intra_search.pred_cu.intra.mode;
@ -1046,7 +1074,7 @@ static double search_cu(
}
intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode;
if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) {
uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search, tree_type);
uvg_search_cu_intra_chroma(state, cu_loc, lcu, &intra_search, tree_type);
if (intra_search.pred_cu.joint_cb_cr == 0) {
intra_search.pred_cu.joint_cb_cr = 4;
@ -1066,7 +1094,7 @@ static double search_cu(
false,
true);
if(tree_type != UVG_CHROMA_T) {
intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu);
intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, cu_loc);
}
else {
intra_cost = intra_search.cost;
@ -1080,7 +1108,7 @@ static double search_cu(
}
intra_search.pred_cu.intra.mode = intra_mode;
if(tree_type == UVG_CHROMA_T) {
uvg_lcu_fill_trdepth(lcu, x_local, y_local, split_tree.current_depth, split_tree.current_depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, split_tree.current_depth, tree_type);
}
}
if (intra_cost < cost) {
@ -1187,14 +1215,14 @@ static double search_cu(
// This will no longer be necessary if the transform depths are not shared.
int tr_depth = MAX(1, split_tree.current_depth);
uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, tree_type);
const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc);
if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
//Calculate cost for zero coeffs
inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, split_tree.current_depth) + inter_bitcost * state->lambda;
inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda;
}
cu_loc_t loc;
@ -1239,13 +1267,13 @@ static double search_cu(
cost = bits * state->lambda;
cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0);
cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc);
if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
cost = inter_zero_coeff_cost;
// Restore saved pixels from lower level of the working tree.
copy_cu_pixels(x_local, y_local, cu_width, &work_tree[split_tree.current_depth + 1], lcu, tree_type);
copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type);
if (cur_cu->merged) {
cur_cu->merged = 0;
@ -1256,7 +1284,7 @@ static double search_cu(
if (cur_cu->tr_depth != 0) {
// Reset transform depth since there are no coefficients. This
// ensures that CBF is cleared for the whole area of the CU.
uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);
}
cur_cu->cbf = 0;
@ -1317,10 +1345,8 @@ static double search_cu(
left_cu,
above_cu,
1,
cu_loc,
depth,
cu_width,
x >> (tree_type == UVG_CHROMA_T),
y >> (tree_type == UVG_CHROMA_T),
tree_type,
&split_bits);
}
@ -1380,8 +1406,7 @@ static double search_cu(
uvg_write_split_flag(state, &state->search_cabac,
x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
0, depth, cu_width, x, y, tree_type,
&bits);
0, cu_loc, depth, tree_type, &bits);
cur_cu->intra = cu_d1->intra;
cur_cu->type = CU_INTRA;
@ -1391,7 +1416,7 @@ static double search_cu(
cur_cu->lfnst_idx = 0;
cur_cu->cr_lfnst_idx = 0;
uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, cur_cu->tr_depth, tree_type);
lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
intra_search_data_t proxy;
@ -1404,12 +1429,12 @@ static double search_cu(
lcu,
tree_type,
true,
state->encoder_control->chroma_format == UVG_CSP_400);
state->encoder_control->chroma_format != UVG_CSP_400);
double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits;
cost += mode_bits * state->lambda;
cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0);
cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc);
memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
@ -1419,7 +1444,7 @@ static double search_cu(
if (split_cost < cost) {
// Copy split modes to this depth.
cost = split_cost;
work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr, tree_type);
work_tree_copy_up(work_tree, state->encoder_control->cfg.jccr, tree_type, cu_loc, depth);
#if UVG_DEBUG
//debug_split = 1;
#endif
@ -1427,7 +1452,7 @@ static double search_cu(
// Copy this CU's mode all the way down for use in adjacent CUs mode
// search.
memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type);
work_tree_copy_down(depth, work_tree, tree_type, cu_loc);
downsample_cclm_rec(
state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
);
@ -1454,7 +1479,7 @@ static double search_cu(
} else if (depth >= 0 && depth < MAX_PU_DEPTH) {
// Need to copy modes down since the lower level of the work tree is used
// when searching SMP and AMP blocks.
work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type);
work_tree_copy_down(depth, work_tree, tree_type, cu_loc);
if(tree_type != UVG_CHROMA_T) {
downsample_cclm_rec(
state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]

View file

@ -84,17 +84,23 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);
void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);
double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
double uvg_cu_rd_cost_luma(
const encoder_state_t *const state,
const cu_loc_t* const cu_loc,
const cu_info_t *const pred_cu,
lcu_t *const lcu,
uint8_t isp_cbf);
double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
double uvg_cu_rd_cost_chroma(
const encoder_state_t *const state,
cu_info_t *const pred_cu,
lcu_t *const lcu);
lcu_t *const lcu,
const cu_loc_t * const);
void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
void uvg_lcu_fill_trdepth(
lcu_t *lcu,
const cu_loc_t* const cu_loc,
uint8_t tr_depth,
enum uvg_tree_type
tree_type);
void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);

View file

@ -1811,7 +1811,7 @@ static void search_pu_inter(
cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1];
cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0];
cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1];
uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T);
uvg_lcu_fill_trdepth(lcu, cu_loc, MAX(1, depth), UVG_BOTH_T);
uvg_inter_recon_cu(state, lcu, true, false, cu_loc);
uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T);
@ -2129,12 +2129,12 @@ void uvg_cu_cost_inter_rd2(
const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
int tr_depth = MAX(1, depth);
uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, tr_depth, UVG_BOTH_T);
uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, UVG_BOTH_T);
const int x_px = SUB_SCU(cu_loc->x);
const int y_px = SUB_SCU(cu_loc->y);
const int width = LCU_WIDTH >> depth;
const int height = width; // TODO: non-square blocks
const int width = cu_loc->width;
const int height = cu_loc->height;
cabac_data_t cabac_copy;
memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
@ -2155,10 +2155,10 @@ void uvg_cu_cost_inter_rd2(
int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width / 2);
cu_loc->chroma_width);
double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width / 2);
cu_loc->chroma_width);
ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
}
double no_cbf_bits;
@ -2217,12 +2217,10 @@ void uvg_cu_cost_inter_rd2(
uvg_chorma_ts_out_t chorma_ts_out;
uvg_chroma_transform_search(
state,
depth,
lcu,
&cabac_copy,
cu_loc,
index,
0,
cur_cu,
u_pred,
v_pred,
@ -2262,10 +2260,10 @@ void uvg_cu_cost_inter_rd2(
int cbf = cbf_is_set_any(cur_cu->cbf, depth);
if(cbf) {
*inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, 0);
*inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0);
if (reconstruct_chroma) {
if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) {
*inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
*inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc);
}
else {
*inter_cost += chroma_cost;

View file

@ -431,9 +431,7 @@ static double search_intra_trdepth(
}
double rd_cost = uvg_cu_rd_cost_luma(
state,
lcu_px.x,
lcu_px.y,
depth,
cu_loc,
pred_cu,
lcu,
search_data->best_isp_cbfs);
@ -502,11 +500,9 @@ static double search_intra_trdepth(
);
best_rd_cost += uvg_cu_rd_cost_chroma(
state,
lcu_px.x,
lcu_px.y,
depth,
pred_cu,
lcu);
lcu,
cu_loc);
pred_cu->intra.mode = luma_mode;
// Check lfnst constraints for chroma
@ -552,7 +548,7 @@ static double search_intra_trdepth(
UVG_BOTH_T,
false,
true);
best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
best_rd_cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc);
pred_cu->intra.mode = luma_mode;
}
pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
@ -655,7 +651,7 @@ static double search_intra_trdepth(
if (depth == 0 || split_cost < nosplit_cost) {
return split_cost;
} else {
uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);
pred_cu->cbf = nosplit_cbf;
@ -690,19 +686,15 @@ static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length)
static int search_intra_chroma_rough(
encoder_state_t * const state,
int x_px,
int y_px,
int depth,
const vector2d_t* const lcu_px,
intra_search_data_t* chroma_data,
lcu_t* lcu,
int8_t luma_mode,
enum uvg_tree_type tree_type)
enum uvg_tree_type tree_type,
const cu_loc_t* const cu_loc)
{
assert(depth != 4 || (x_px & 4 && y_px & 4));
const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width];
const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 };
const int width = 1 << log2_width_c;
const int height = width; // TODO: height for non-square blocks
@ -714,7 +706,7 @@ static int search_intra_chroma_rough(
uvg_intra_references refs_v;
uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0);
vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 };
vector2d_t lcu_cpx = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 };
uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
@ -1494,29 +1486,19 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
int8_t uvg_search_intra_chroma_rdo(
encoder_state_t * const state,
int x_px,
int y_px,
int depth,
int8_t num_modes,
lcu_t *const lcu,
intra_search_data_t* chroma_data,
int8_t luma_mode,
enum uvg_tree_type tree_type)
enum uvg_tree_type tree_type,
const cu_loc_t* const cu_loc)
{
const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);
const bool reconstruct_chroma = true;
const int luma_width = LCU_WIDTH >> depth;
const int luma_height = LCU_WIDTH >> depth; // TODO: height
int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, x_px & ~7, y_px & ~7, luma_width, luma_height);
const int chroma_width = loc.chroma_width;
const int chroma_height = loc.chroma_height;
const int chroma_width = cu_loc->chroma_width;
const int chroma_height = cu_loc->chroma_height;
uvg_intra_references refs[2];
const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 };
const vector2d_t pic_px = {
state->tile->frame->width,
state->tile->frame->height,
@ -1524,17 +1506,17 @@ int8_t uvg_search_intra_chroma_rdo(
if (reconstruct_chroma) {
uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0);
uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0);
uvg_intra_build_reference(cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0);
uvg_intra_build_reference(cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0);
const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y };
cabac_data_t temp_cabac;
memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t));
const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C;
const int offset = ((cu_loc->local_x & ~7) >> 1) + ((cu_loc->local_y & ~7) >> 1)* LCU_WIDTH_C;
int lfnst_modes_to_check[3];
if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) {
if((cu_loc->width == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) {
for (int i = 0; i < 3; ++i) {
lfnst_modes_to_check[i] = i;
}
@ -1572,7 +1554,7 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_intra_predict(
state,
&refs[COLOR_U - 1],
&loc,
cu_loc,
COLOR_U,
u_pred,
&chroma_data[mode_i],
@ -1581,7 +1563,7 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_intra_predict(
state,
&refs[COLOR_V - 1],
&loc,
cu_loc,
COLOR_V,
v_pred,
&chroma_data[mode_i],
@ -1606,12 +1588,10 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_chorma_ts_out_t chorma_ts_out;
uvg_chroma_transform_search(
state,
depth,
lcu,
&temp_cabac,
&loc,
cu_loc,
offset,
mode,
pred_cu,
u_pred,
v_pred,
@ -1653,12 +1633,12 @@ int8_t uvg_search_intra_chroma_rdo(
state->search_cabac.update = 1;
chroma_data[mode_i].cost = mode_bits * state->lambda;
uvg_intra_recon_cu(state,
&chroma_data[mode_i], &loc,
&chroma_data[mode_i], cu_loc,
pred_cu, lcu,
tree_type,
false,
true);
chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc);
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
}
}
@ -1677,14 +1657,11 @@ int8_t uvg_search_intra_chroma_rdo(
int8_t uvg_search_cu_intra_chroma(
encoder_state_t * const state,
const int x_px,
const int y_px,
const int depth,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
intra_search_data_t *search_data,
enum uvg_tree_type tree_type)
{
const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
const cu_info_t *cur_pu = &search_data->pred_cu;
int8_t intra_mode = !cur_pu->intra.mip_flag ? cur_pu->intra.mode : 0;
@ -1698,6 +1675,9 @@ int8_t uvg_search_cu_intra_chroma(
}
}
cu_loc_t chroma_loc;
uvg_cu_loc_ctor(&chroma_loc, cu_loc->x & ~7, cu_loc->y & ~7, cu_loc->width, cu_loc->height);
// The number of modes to select for slower chroma search. Luma mode
// is always one of the modes, so 2 means the final decision is made
// between luma mode and one other mode that looks the best
@ -1715,7 +1695,7 @@ int8_t uvg_search_cu_intra_chroma(
chroma_data[i].pred_cu = *cur_pu;
chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? intra_mode : modes[i];
chroma_data[i].cost = 0;
if(depth != 4 && tree_type == UVG_BOTH_T) {
if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) {
memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3);
}
}
@ -1726,16 +1706,13 @@ int8_t uvg_search_cu_intra_chroma(
if(state->encoder_control->cfg.cclm && 0){
num_modes = search_intra_chroma_rough(state, x_px, y_px, depth,
&lcu_px,
chroma_data,
lcu,
intra_mode,
tree_type);
num_modes = search_intra_chroma_rough(state, chroma_data, lcu, intra_mode,
tree_type,
&chroma_loc);
}
if (num_modes > 1 || state->encoder_control->cfg.jccr) {
uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode, tree_type);
uvg_search_intra_chroma_rdo(state, num_modes, lcu, chroma_data, intra_mode, tree_type, &chroma_loc);
}
else if(cur_pu->lfnst_idx) {
chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx;
@ -1983,7 +1960,7 @@ void uvg_search_cu_intra(
// Set transform depth to current depth, meaning no transform splits.
{
const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type);
uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);
}
// Refine results with slower search or get some results if rough search was skipped.
const int32_t rdo_level = state->encoder_control->cfg.rdo;

View file

@ -52,9 +52,7 @@ double uvg_chroma_mode_bits(const encoder_state_t *state,
int8_t uvg_search_cu_intra_chroma(
encoder_state_t * const state,
const int x_px,
const int y_px,
const int depth,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
intra_search_data_t* best_cclm,
enum uvg_tree_type tree_type);

View file

@ -709,7 +709,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
uvg_rdoq(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index);
scan_order, cur_cu->type, cur_cu->cbf, lfnst_index);
}
else if (state->encoder_control->cfg.rdoq_enable && use_trskip) {
uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,

View file

@ -54,7 +54,7 @@
void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
const cu_loc_t *cu_loc,
const cu_loc_t * const cu_loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,
@ -80,8 +80,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
// CONSTANTS
const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
const uint8_t log2_block_width = uvg_g_convert_to_log2[width];
const uint8_t log2_block_height = uvg_g_convert_to_log2[height];
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);

View file

@ -44,7 +44,7 @@
void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
const cu_loc_t *loc,
const cu_loc_t * const loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,

View file

@ -317,8 +317,7 @@ int uvg_quant_cbcr_residual_generic(
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
cur_cu->cr_lfnst_idx);
scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx);
}
else if (state->encoder_control->cfg.rdoq_enable && false) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
@ -499,8 +498,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
uvg_rdoq(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
lfnst_index);
scan_order, cur_cu->type, cur_cu->cbf, lfnst_index);
} else if(state->encoder_control->cfg.rdoq_enable && use_trskip) {
uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
scan_order);

View file

@ -49,7 +49,7 @@
typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
const cu_loc_t *loc,
const cu_loc_t * const loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,

View file

@ -434,8 +434,7 @@ static void quantize_chroma(
(transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
{
uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, depth, 0,
lfnst_idx);
scan_order, CU_INTRA, 0, lfnst_idx);
int j;
for (j = 0; j < width * height; ++j) {
@ -449,8 +448,7 @@ static void quantize_chroma(
uint16_t temp_cbf = 0;
if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, depth, temp_cbf,
lfnst_idx);
scan_order, CU_INTRA, temp_cbf, lfnst_idx);
}
}
@ -486,12 +484,10 @@ static void quantize_chroma(
void uvg_chroma_transform_search(
encoder_state_t* const state,
int depth,
lcu_t* const lcu,
cabac_data_t* temp_cabac,
const cu_loc_t* const cu_loc,
const int offset,
const uint8_t mode,
cu_info_t* pred_cu,
uvg_pixel u_pred[1024],
uvg_pixel v_pred[1024],
@ -507,6 +503,8 @@ void uvg_chroma_transform_search(
const int width = cu_loc->chroma_width;
const int height = cu_loc->chroma_height;
const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
uvg_transform2d(
state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu
);
@ -553,8 +551,6 @@ void uvg_chroma_transform_search(
coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
const coeff_scan_order_t scan_order =
uvg_get_scan_order(pred_cu->type, mode, depth);
bool u_has_coeffs = false;
bool v_has_coeffs = false;
if(pred_cu->cr_lfnst_idx) {
@ -575,13 +571,13 @@ void uvg_chroma_transform_search(
i,
u_quant_coeff,
v_quant_coeff,
scan_order,
SCAN_DIAG,
&u_has_coeffs,
&v_has_coeffs,
pred_cu->cr_lfnst_idx);
if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue;
if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) {
if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (cu_loc->width == 4 || tree_type == UVG_CHROMA_T)) {
bool constraints[2] = { false, false };
uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U);
if(!IS_JCCR_MODE(transforms[i])) {
@ -593,9 +589,9 @@ void uvg_chroma_transform_search(
if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue;
if (u_has_coeffs) {
uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
pred_cu->type, transforms[i] == CHROMA_TS);
if (transforms[i] != CHROMA_TS) {
if (pred_cu->cr_lfnst_idx) {
uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type);
@ -606,6 +602,7 @@ void uvg_chroma_transform_search(
else {
uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height);
}
if (transforms[i] != JCCR_1) {
for (int j = 0; j < width * height; j++) {
u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((uvg_pixel)(u_pred[j] + u_recon_resi[j]));
@ -620,9 +617,12 @@ void uvg_chroma_transform_search(
else {
uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width);
}
if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) {
uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
pred_cu->type, transforms[i] == CHROMA_TS);
if (transforms[i] != CHROMA_TS) {
if (pred_cu->cr_lfnst_idx) {
uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type);
@ -633,6 +633,7 @@ void uvg_chroma_transform_search(
else {
uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height);
}
for (int j = 0; j < width * height; j++) {
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]);
}
@ -700,7 +701,7 @@ void uvg_chroma_transform_search(
pred_cu,
cu_loc,
COLOR_U,
scan_order,
SCAN_DIAG,
transforms[i] == CHROMA_TS,
COEFF_ORDER_LINEAR);
u_bits += coeff_cost;
@ -717,7 +718,7 @@ void uvg_chroma_transform_search(
pred_cu,
cu_loc,
COLOR_V,
scan_order,
SCAN_DIAG,
transforms[i] == CHROMA_TS,
COEFF_ORDER_LINEAR);
}

View file

@ -104,12 +104,10 @@ void uvg_quantize_lcu_residual(
void uvg_chroma_transform_search(
encoder_state_t* const state,
int depth,
lcu_t* const lcu,
cabac_data_t* temp_cabac,
const cu_loc_t* const cu_loc,
const int offset,
const uint8_t mode,
cu_info_t* pred_cu,
uvg_pixel u_pred[1024],
uvg_pixel v_pred[1024],

View file

@ -6,10 +6,10 @@ set -eu
cabacfile="$(mktemp)"
valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
python3 check_cabac_state_consistency.py "${cabacfile}"
valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
python3 check_cabac_state_consistency.py "${cabacfile}"
rm -rf "${cabacfile}"