From 626c9b02eaf45d6e5923a351a9ce879ecde546d9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Aug 2022 13:23:27 +0300 Subject: [PATCH] [isp] Modify transform and quantization functions to handle non-square blocks. Add strategy headers to CMakelist. --- CMakeLists.txt | 2 +- src/cu.h | 4 +- src/search.c | 28 +++++----- src/search_inter.c | 2 + src/search_intra.c | 15 +++--- src/strategies/avx2/dct-avx2.c | 4 +- src/strategies/avx2/intra-avx2.c | 3 ++ src/strategies/avx2/picture-avx2.c | 4 +- src/strategies/avx2/quant-avx2.c | 12 ++--- src/strategies/generic/dct-generic.c | 18 +++++-- src/strategies/generic/picture-generic.c | 4 +- src/strategies/generic/quant-generic.c | 35 ++++++------ src/strategies/generic/quant-generic.h | 3 +- src/strategies/strategies-dct.c | 10 +++- src/strategies/strategies-dct.h | 4 +- src/strategies/strategies-picture.h | 2 +- src/strategies/strategies-quant.h | 20 +++++-- src/transform.c | 68 ++++++++++++++---------- src/transform.h | 5 +- tests/mts_tests.c | 6 +-- 20 files changed, 153 insertions(+), 96 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0ec99c7..ab0b63a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") # Add also all the strategies -file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c") diff --git a/src/cu.h b/src/cu.h index 6fe960e7..f5eeb5e6 100644 --- a/src/cu.h +++ b/src/cu.h @@ -415,9 +415,9 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu */ static 
INLINE void copy_coeffs(const coeff_t *__restrict src, coeff_t *__restrict dest, - size_t width) + size_t width, size_t height) { - memcpy(dest, src, width * width * sizeof(coeff_t)); + memcpy(dest, src, width * height * sizeof(coeff_t)); } diff --git a/src/search.c b/src/search.c index 4fbf33f3..56e07b06 100644 --- a/src/search.c +++ b/src/search.c @@ -89,20 +89,20 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr } } -static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum +static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum uvg_tree_type tree_type) { if (tree_type != UVG_CHROMA_T) { - const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local); - copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width); + const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); + copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], cu_loc->width, cu_loc->height); } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T)); - copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1); - copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1); + const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); + copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); + copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); if (joint) { - copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1); + copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], cu_loc->chroma_width, cu_loc->chroma_height); } } } @@ -114,9 +114,11 @@ static 
void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t uvg_tree_type tree_type) { const int width = LCU_WIDTH >> depth; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x_local, y_local, width, width); copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]); copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type); - copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); + copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); } @@ -1093,7 +1095,7 @@ static double search_cu( } cu_loc_t loc; - const int width = LCU_WIDTH << depth; + const int width = LCU_WIDTH >> depth; const int height = width; // TODO: height for non-square blocks uvg_cu_loc_ctor(&loc, x, y, width, height); uvg_quantize_lcu_residual(state, @@ -1579,7 +1581,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); // Copy coeffs to encoder state. 
- copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH); + copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( @@ -1596,9 +1598,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C); + copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C); } } diff --git a/src/search_inter.c b/src/search_inter.c index 7922f34b..ff511740 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -2225,6 +2225,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, u_pred, u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -2232,6 +2233,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_pred, v_resi, width, + height, LCU_WIDTH_C, width); diff --git a/src/search_intra.c b/src/search_intra.c index f3c8c838..06b86cc7 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -249,8 +249,11 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, // ISP_TODO: move this function if it is used elsewhere -bool can_use_isp(const int width, const int height, const int max_tr_size) +static INLINE bool can_use_isp(const int width, const int height, const int max_tr_size) { + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); + assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); + const int log2_width = uvg_g_convert_to_bit[width] + 2; const 
int log2_height = uvg_g_convert_to_bit[height] + 2; @@ -300,16 +303,14 @@ int uvg_get_isp_split_dim(const int width, const int height, const int split_typ // ISP_TODO: move this function if it is used elsewhere -bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode) +static INLINE bool can_use_isp_with_lfnst(const int width, const int height, const int isp_mode) { if (isp_mode == ISP_MODE_NO_ISP) { return false; } const int tu_width = isp_mode == ISP_MODE_HOR ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER); const int tu_height = isp_mode == ISP_MODE_HOR ? uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR) : height; - - // ISP_TODO: make a define for this or use existing - const int min_tb_size = 4; + const int min_tb_size = TR_MIN_WIDTH; if (!(tu_width >= min_tb_size && tu_height >= min_tb_size)) { return false; @@ -1449,7 +1450,7 @@ static int8_t search_intra_rdo( enum uvg_tree_type tree_type) { const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); - const int width = LCU_WIDTH << depth; + const int width = LCU_WIDTH >> depth; const int height = width; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { @@ -1633,6 +1634,7 @@ int8_t uvg_search_intra_chroma_rdo( u_pred, u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -1640,6 +1642,7 @@ int8_t uvg_search_intra_chroma_rdo( v_pred, v_resi, width, + height, LCU_WIDTH_C, width); uvg_chorma_ts_out_t chorma_ts_out; diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index b695273b..f3c812ed 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1590,18 +1590,20 @@ static void mts_dct_avx2( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) { tr_type_t type_hor; tr_type_t type_ver; + // ISP_TODO: height passed but not used 
uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx) { - dct_func* dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 79e60def..fc19654a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -61,6 +61,7 @@ static void uvg_angular_pred_avx2( uvg_pixel *const dst, const uint8_t multi_ref_idx) { + // ISP_TODO: non-square block implementation, height is passed but not used const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_bit[width] + 2; @@ -512,6 +513,7 @@ static void uvg_intra_pred_planar_avx2( const uint8_t *const ref_left, uint8_t *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_bit[width] + 2; @@ -977,6 +979,7 @@ static void uvg_pdpc_planar_dc_avx2( const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used assert(mode == 0 || mode == 1); // planar or DC const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index df90f149..a911928d 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1743,8 +1743,8 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t* return diff; } -static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) { - +static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) { + // ISP_TODO: non-square block implementation, height is passed but not used __m128i diff = _mm_setzero_si128(); switch (width) { case 4: diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 5c39fe11..8313b1f0 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -626,7 +626,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_avx2(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uint8_t *const ref_in, const uint8_t *const pred_in, @@ -637,15 +637,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Temporary arrays to pass data to and from uvg_quant and transform functions. 
ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - - const int height = width; // TODO: height for non-square blocks + // ISP_TODO: non-square block implementation, height is passed but not used + int has_coeffs = 0; assert(width <= TR_MAX_WIDTH); assert(width >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; @@ -662,10 +662,10 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint16_t lfnst_index = color == COLOR_Y ? 
cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index cd05a01f..00562737 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -739,6 +739,11 @@ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, const int16_t *inp partial_butterfly_inverse_ ## n ## _generic(tmp, output, shift_2nd); \ } +static void dct_non_square_generic(int8_t bitdepth, const int16_t* input, int16_t* output) +{ + // ISP_TODO: non-square transform here +} + DCT_NXN_GENERIC(4); DCT_NXN_GENERIC(8); DCT_NXN_GENERIC(16); @@ -2487,26 +2492,28 @@ static void mts_dct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx) { tr_type_t type_hor; tr_type_t type_ver; + // ISP_TODO: height passed but not used uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx || width != height) { - dct_func *dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else { - const int height = width; int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0); int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + const int log2_height_minus2 = uvg_g_convert_to_bit[height]; if(tu->lfnst_idx || tu->cr_lfnst_idx) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { @@ -2521,11 +2528,11 @@ static void mts_dct_generic( } partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2]; + partial_tr_func* dct_ver = dct_table[type_ver][log2_height_minus2]; int16_t tmp[32 * 32]; const int32_t shift_1st = log2_width_minus2 + bitdepth - 7; - const int32_t shift_2nd = log2_width_minus2 + 8; + const int32_t shift_2nd = log2_height_minus2 + 8; dct_hor(input, tmp, shift_1st, height, 0, skip_width); dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); @@ -2582,6 +2589,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic); success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic); success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic); + success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic); success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 817befed..6797a669 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -783,10 +783,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len) static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, - int width, int ref_stride, int pred_stride) + int width, int height, int ref_stride, int pred_stride) { int y, x; - for (y = 0; y < width; ++y) { + for 
(y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 96d2567a..03d4daf8 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -237,6 +237,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -247,28 +248,28 @@ int uvg_quant_cbcr_residual_generic( uvg_pixel* v_rec_out, coeff_t* coeff_out, bool early_skip, - int lmcs_chroma_adj, enum uvg_tree_type tree_type - ) { + int lmcs_chroma_adj, enum uvg_tree_type tree_type) +{ ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - + // ISP_TODO: this function is not fully converted to handle non-square blocks { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]); v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]); } } } - uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride); - uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride); + uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride); + uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride); const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? 
-1 : 1); - for (int y = 0; y < width; y++) + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -305,9 +306,9 @@ int uvg_quant_cbcr_residual_generic( } - uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); } if (state->encoder_control->cfg.rdoq_enable && @@ -441,7 +442,7 @@ int uvg_quant_cbcr_residual_generic( * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -454,19 +455,17 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int has_coeffs = 0; - assert(width <= TR_MAX_WIDTH); - assert(width >= TR_MIN_WIDTH); - - const int height = width; // TODO: height for non-square blocks + assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); + assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); // Get residual. 
(ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { sign = residual[x + y * width] >= 0 ? 1 : -1; absval = sign * residual[x + y * width]; @@ -477,10 +476,10 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint8_t lfnst_index = color == COLOR_Y ? 
cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h index da2b05ae..ba1fa130 100644 --- a/src/strategies/generic/quant-generic.h +++ b/src/strategies/generic/quant-generic.h @@ -60,7 +60,7 @@ void uvg_quant_generic( uint8_t lfnst_idx); int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -71,6 +71,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 4ba2a37b..07f0fcb4 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0; dct_func * uvg_dct_8x8 = 0; dct_func * uvg_dct_16x16 = 0; dct_func * uvg_dct_32x32 = 0; +dct_func * uvg_dct_non_square = 0; dct_func * uvg_fast_inverse_dst_4x4 = 0; @@ -56,9 +57,11 @@ void(*uvg_mts_dct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, const int8_t mts_idx); + void(*uvg_mts_idct)(int8_t bitdepth, color_t color, const cu_info_t *tu, @@ -90,8 +93,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) { * * \returns Pointer to the function. */ -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. 
Return generic dct for non-square blocks. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function."); + return uvg_dct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index d58bf5a9..50cc3b5a 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4; extern dct_func * uvg_dct_8x8; extern dct_func * uvg_dct_16x16; extern dct_func * uvg_dct_32x32; +extern dct_func * uvg_dct_non_square; extern dct_func * uvg_fast_inverse_dst_4x4; @@ -64,6 +65,7 @@ typedef void (mts_dct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, const int8_t mts_idx); @@ -82,7 +84,7 @@ typedef void (mts_idct_func)( extern mts_idct_func* uvg_mts_idct; int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type); +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type); dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type); diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 88f52cfc..8d73f74c 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu, typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len); -typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride); +typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride); extern const uint32_t uvg_crc_table[256]; diff --git
a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h index a6c9a3d4..2920ed82 100644 --- a/src/strategies/strategies-quant.h +++ b/src/strategies/strategies-quant.h @@ -45,12 +45,23 @@ #include "tables.h" // Declare function pointers. -typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx); +typedef unsigned (quant_func)( + const encoder_state_t * const state, + coeff_t *coef, + coeff_t *q_coef, + int32_t width, + int32_t height, + color_t color, + int8_t scan_idx, + int8_t block_type, + int8_t transform_skip, + uint8_t lfnst_idx); + typedef unsigned (quant_cbcr_func)( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -63,15 +74,18 @@ typedef unsigned (quant_cbcr_func)( bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (quant_residual_func)(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, uvg_pixel *rec_out, coeff_t *coeff_out, bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, color_t color, int8_t block_type, int8_t transform_skip); + typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); diff --git 
a/src/transform.c b/src/transform.c index abf793c2..0f73eeeb 100644 --- a/src/transform.c +++ b/src/transform.c @@ -77,6 +77,7 @@ const uint8_t uvg_g_chroma_scale[58]= * Parameters pred_in and rec_out may be aliased. * * \param width Transform width. + * \param height Transform height. * \param in_stride Stride for ref_in and pred_in * \param out_stride Stride for rec_out. * \param ref_in Reference pixels. @@ -87,6 +88,7 @@ const uint8_t uvg_g_chroma_scale[58]= * \returns Whether coeff_out contains any non-zero coefficients. */ static bool bypass_transquant(const int width, + const int height, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, @@ -96,7 +98,7 @@ static bool bypass_transquant(const int width, { bool nonzero_coeffs = false; - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int32_t in_idx = x + y * in_stride; int32_t out_idx = x + y * out_stride; @@ -123,6 +125,7 @@ static bool bypass_transquant(const int width, * \param coeff coefficients (residual) to filter */ static void rdpcm(const int width, + const int height, const rdpcm_dir dir, coeff_t *coeff) { @@ -130,7 +133,7 @@ static void rdpcm(const int width, const int min_x = (dir == RDPCM_HOR) ? 1 : 0; const int min_y = (dir == RDPCM_HOR) ? 
0 : 1; - for (int y = width - 1; y >= min_y; y--) { + for (int y = height - 1; y >= min_y; y--) { for (int x = width - 1; x >= min_x; x--) { const int index = x + y * width; coeff[index] -= coeff[index - offset]; @@ -203,17 +206,18 @@ void uvg_derive_lfnst_constraints( /** * \brief NxN inverse transform (2D) - * \param coeff input data (transform coefficients) - * \param block output data (residual) - * \param block_size input data (width of transform) + * \param coeff input data (transform coefficients) + * \param block output data (residual) + * \param width transform width + * \param height transform height */ -void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height) { - int32_t j,k; - for (j = 0; j < block_size; j++) { - for(k = 0; k < block_size; k ++) { + int32_t j, k; + for (j = 0; j < height; j++) { + for(k = 0; k < width; k ++) { // Casting back and forth to make UBSan not trigger due to left-shifting negatives - coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k])); + coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k])); } } } @@ -243,17 +247,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block, void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx) + if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height) { - uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts); + uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts); } else { - dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type); 
+ dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type); dct_func(encoder->bitdepth, block, coeff); } } @@ -373,6 +378,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask1 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -386,6 +392,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask2 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -492,10 +499,10 @@ void uvg_chroma_transform_search( ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; uvg_transform2d( - state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu + state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); uvg_transform2d( - state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu + state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu ); enum uvg_chroma_transforms transforms[5]; transforms[0] = DCT7_CHROMA; @@ -508,8 +515,8 @@ void uvg_chroma_transform_search( pred_cu->cr_lfnst_idx == 0 ; if (can_use_tr_skip) { - uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width); - uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width); + uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height); + uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height); transforms[num_transforms] = CHROMA_TS; num_transforms++; } @@ -1053,7 +1060,7 @@ void uvg_inv_lfnst( */ int uvg_quantize_residual_trskip( encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, int8_t *trskip_out, const int in_stride, 
const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -1074,7 +1081,7 @@ int uvg_quantize_residual_trskip( //noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; skip.has_coeffs = uvg_quantize_residual( - state, cur_cu, width, color, scan_order, + state, cur_cu, width, height, color, scan_order, 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj, UVG_BOTH_T /* tree type doesn't matter for transformskip*/); @@ -1090,9 +1097,9 @@ int uvg_quantize_residual_trskip( if (best->has_coeffs || rec_out != pred_in) { // If there is no residual and reconstruction is already in rec_out, // we can skip this. - uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride); + uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride); } - copy_coeffs(best->coeff, coeff_out, width); + copy_coeffs(best->coeff, coeff_out, width, height); return best->has_coeffs; } @@ -1131,8 +1138,8 @@ static void quantize_tr_residual( // This should ensure that the CBF data doesn't get corrupted if this function // is called more than once. - int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int32_t lcu_width = LCU_WIDTH >> shift; const int8_t mode = @@ -1183,7 +1190,9 @@ static void quantize_tr_residual( } if (cfg->lossless) { + // ISP_TODO: is there any sensible case where in and out strides would be different? 
has_coeffs = bypass_transquant(tr_width, + tr_height, lcu_width, // in stride lcu_width, // out stride ref, @@ -1193,9 +1202,9 @@ static void quantize_tr_residual( if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) { // implicit rdpcm for horizontal and vertical intra modes if (mode == 18) { - rdpcm(tr_width, RDPCM_HOR, coeff); + rdpcm(tr_width, tr_height, RDPCM_HOR, coeff); } else if (mode == 50) { - rdpcm(tr_width, RDPCM_VER, coeff); + rdpcm(tr_width, tr_height, RDPCM_VER, coeff); } } @@ -1206,6 +1215,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual_trskip(state, cur_pu, tr_width, + tr_height, color, scan_idx, &tr_skip, @@ -1222,6 +1232,7 @@ static void quantize_tr_residual( state, cur_pu, tr_width, + tr_height, scan_idx, lcu_width, lcu_width, @@ -1240,6 +1251,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual(state, cur_pu, tr_width, + tr_height, color, scan_idx, false, // tr skip @@ -1326,8 +1338,8 @@ void uvg_quantize_lcu_residual( const int offset = width / 2; for (int j = 0; j < 2; ++j) { for (int i = 0; i < 2; ++i) { - const cu_loc_t loc; - uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width, height); + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, (x + i * offset), (y + j * offset), width >> 1, height >> 1); // jccr is currently not supported if transform is split uvg_quantize_lcu_residual(state, luma, chroma, 0, &loc, depth + 1, NULL, lcu, early_skip, tree_type); } diff --git a/src/transform.h b/src/transform.h index 61c50c04..a7b6e221 100644 --- a/src/transform.h +++ b/src/transform.h @@ -47,13 +47,14 @@ extern const uint8_t uvg_g_chroma_scale[58]; extern const int16_t uvg_g_inv_quant_scales[6]; extern const int16_t uvg_g_quant_scales[6]; -void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); +void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_itransformskip(const 
encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); diff --git a/tests/mts_tests.c b/tests/mts_tests.c index f607b77d..2a132c77 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -111,7 +111,7 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); + mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); } } } @@ -167,7 +167,7 @@ TEST dct(void) int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); @@ -192,7 +192,7 @@ TEST idct(void) int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]);