Merge branch 'release-prep' into master

This commit is contained in:
Joose Sainio 2023-09-27 08:11:09 +03:00
commit 84580aebb0
79 changed files with 23515 additions and 3642 deletions

View file

@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c")
list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h")
# Add also all the strategies
file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c")
file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c")
# ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC
list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c")
@ -340,6 +340,9 @@ if(NOT DEFINED MSVC)
if(NOT "test_external_symbols" IN_LIST XFAIL)
add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
endif()
if(NOT "test_mtt" IN_LIST XFAIL)
add_test( NAME test_mtt COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mtt.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
endif()
if(NOT "test_intra" IN_LIST XFAIL)
add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests)
endif()

View file

@ -77,6 +77,8 @@ typedef struct
cabac_ctx_t mts_idx_model[4];
cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models
cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models
cabac_ctx_t mtt_vertical_model[5];
cabac_ctx_t mtt_binary_model[4];
cabac_ctx_t intra_luma_mpm_flag_model; //!< \brief intra mode context models
cabac_ctx_t intra_subpart_model[2]; //!< \brief intra sub part context models
cabac_ctx_t chroma_pred_model;

100
src/cfg.c
View file

@ -80,7 +80,6 @@ int uvg_config_init(uvg_config *cfg)
cfg->trskip_max_size = 2; //Default to 4x4
cfg->mts = 0;
cfg->mts_implicit = 0;
cfg->tr_depth_intra = 0;
cfg->ime_algorithm = 0; /* hexbs */
cfg->fme_level = 4;
cfg->source_scan_type = 0; /* progressive */
@ -207,6 +206,8 @@ int uvg_config_init(uvg_config *cfg)
cfg->lfnst = false;
cfg->isp = false;
parse_qp_map(cfg, 0);
cfg->jccr = 0;
@ -221,10 +222,27 @@ int uvg_config_init(uvg_config *cfg)
cfg->cabac_debug_file_name = NULL;
cfg->dual_tree = 0;
cfg->min_qt_size[0] = 4;
cfg->min_qt_size[1] = 4;
cfg->min_qt_size[2] = 4;
cfg->max_btt_depth[0] = 0;
cfg->max_btt_depth[1] = 0;
cfg->max_btt_depth[2] = 0;
cfg->max_tt_size[0] = 64;
cfg->max_bt_size[0] = 64;
cfg->max_tt_size[1] = 64;
cfg->max_bt_size[1] = 64;
cfg->max_tt_size[2] = 64;
cfg->max_bt_size[2] = 64;
cfg->intra_rough_search_levels = 2;
cfg->ibc = 0;
cfg->dep_quant = 0;
return 1;
}
@ -333,7 +351,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil
return 1;
}
/*
static int parse_uint8(const char *numstr,uint8_t* number,int min, int max)
{
char *tail;
@ -349,7 +367,7 @@ static int parse_uint8(const char *numstr,uint8_t* number,int min, int max)
return 1;
}
}
*/
static int parse_int8(const char *numstr,int8_t* number,int min, int max)
{
char *tail;
@ -365,7 +383,7 @@ static int parse_int8(const char *numstr,int8_t* number,int min, int max)
return 1;
}
}
/*
static int parse_array(const char *array, uint8_t *coeff_key, int size,
int min, int max)
{
@ -389,15 +407,15 @@ static int parse_array(const char *array, uint8_t *coeff_key, int size,
free(key);
return 0;
}
else if (i<size){
fprintf(stderr, "parsing failed : too few members.\n");
//else if (i<size){
// fprintf(stderr, "parsing failed : too few members.\n");
// free(key);
// return 0;
//}
free(key);
return 0;
}
free(key);
return 1;
return i;
}
*/
static int parse_qp_scale_array(const char *array, int8_t *out)
{
@ -928,8 +946,6 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
cfg->mts = mts_type;
cfg->mts_implicit = (mts_type == UVG_MTS_IMPLICIT);
}
else if OPT("tr-depth-intra")
cfg->tr_depth_intra = atoi(value);
else if OPT("me") {
int8_t ime_algorithm = 0;
if (!parse_enum(value, me_names, &ime_algorithm)) return 0;
@ -1454,6 +1470,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
else if OPT("lfnst") {
cfg->lfnst = atobool(value);
}
else if OPT("isp") {
cfg->isp = atobool(value);
}
else if OPT("jccr") {
cfg->jccr = (bool)atobool(value);
}
@ -1479,6 +1498,49 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
else if OPT("dual-tree") {
cfg->dual_tree = atobool(value);
}
else if OPT("mtt-depth-intra") {
cfg->max_btt_depth[0] = atoi(value);
}
else if OPT("mtt-depth-intra-chroma") {
cfg->max_btt_depth[2] = atoi(value);
}
else if OPT("mtt-depth-inter") {
cfg->max_btt_depth[1] = atoi(value);
}
else if OPT("max-bt-size") {
uint8_t sizes[3];
const int got = parse_array(value, sizes, 3, 0, 128);
if (got == 1) {
cfg->max_bt_size[0] = sizes[0];
cfg->max_bt_size[1] = sizes[0];
cfg->max_bt_size[2] = sizes[0];
}
else if (got == 3) {
cfg->max_bt_size[0] = sizes[0];
cfg->max_bt_size[1] = sizes[1];
cfg->max_bt_size[2] = sizes[2];
} else {
fprintf(stderr, "Incorrect amount of values provided for max-bt-size\n");
return 0;
}
}
else if OPT("max-tt-size") {
uint8_t sizes[3];
const int got = parse_array(value, sizes, 3, 0, 128);
if (got == 1) {
cfg->max_tt_size[0] = sizes[0];
cfg->max_tt_size[1] = sizes[0];
cfg->max_tt_size[2] = sizes[0];
}
else if (got == 3) {
cfg->max_tt_size[0] = sizes[0];
cfg->max_tt_size[1] = sizes[1];
cfg->max_tt_size[2] = sizes[2];
} else {
fprintf(stderr, "Incorrect amount of values provided for max-tt-size\n");
return 0;
}
}
else if OPT("intra-rough-granularity") {
cfg->intra_rough_search_levels = atoi(value);
}
@ -1489,7 +1551,11 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
return 0;
}
cfg->ibc = (uint8_t)ibc_value;
} else {
}
else if OPT("dep-quant") {
cfg->dep_quant = (bool)atobool(value);
}
else {
return 0;
}
#undef OPT
@ -1681,12 +1747,6 @@ int uvg_config_validate(const uvg_config *const cfg)
error = 1;
}
if (cfg->tr_depth_intra < 0 || cfg->tr_depth_intra > 4) {
// range is 0 .. CtbLog2SizeY - Log2MinTrafoSize
fprintf(stderr, "Input error: --tr-depth-intra is out of range [0..4]\n");
error = 1;
}
if (cfg->fme_level != 0 && cfg->fme_level > 4) {
fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n");
error = 1;

View file

@ -76,7 +76,6 @@ static const struct option long_options[] = {
{ "tr-skip-max-size", required_argument, NULL, 0 },
{ "mts", required_argument, NULL, 0 },
{ "no-mts", no_argument, NULL, 0 },
{ "tr-depth-intra", required_argument, NULL, 0 },
{ "me", required_argument, NULL, 0 },
{ "subme", required_argument, NULL, 0 },
{ "source-scan-type", required_argument, NULL, 0 },
@ -178,6 +177,8 @@ static const struct option long_options[] = {
{ "no-mip", no_argument, NULL, 0 },
{ "lfnst", no_argument, NULL, 0 },
{ "no-lfnst", no_argument, NULL, 0 },
{ "isp", no_argument, NULL, 0 },
{ "no-isp", no_argument, NULL, 0 },
{ "jccr", no_argument, NULL, 0 },
{ "no-jccr", no_argument, NULL, 0 },
{ "amvr", no_argument, NULL, 0 },
@ -191,8 +192,15 @@ static const struct option long_options[] = {
{ "dual-tree", no_argument, NULL, 0 },
{ "no-dual-tree", no_argument, NULL, 0 },
{ "cabac-debug-file", required_argument, NULL, 0 },
{ "mtt-depth-intra", required_argument, NULL, 0 },
{ "mtt-depth-inter", required_argument, NULL, 0 },
{ "mtt-depth-intra-chroma", required_argument, NULL, 0 },
{ "max-bt-size", required_argument, NULL, 0 },
{ "max-tt-size", required_argument, NULL, 0 },
{ "intra-rough-granularity",required_argument, NULL, 0 },
{ "ibc", required_argument, NULL, 0 },
{ "dep-quant", no_argument, NULL, 0 },
{ "no-dep-quant", no_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -571,6 +579,7 @@ void print_help(void)
" - full: Full ALF\n"
" --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n"
" --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n"
" --(no-)dep-quant : Use dependent quantization. [disabled]\n"
" --(no-)signhide : Sign hiding [disabled]\n"
" --rd <integer> : Intra mode search complexity [0]\n"
" - 0: Skip intra if inter is good enough.\n"
@ -602,14 +611,14 @@ void print_help(void)
" - 2: + 1/2-pixel diagonal\n"
" - 3: + 1/4-pixel horizontal and vertical\n"
" - 4: + 1/4-pixel diagonal\n"
" --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n"
" - 0, 1, 2, 3: from 64x64 to 8x8\n"
" --pu-depth-inter <int>-<int> : Maximum and minimum split depths where\n"
" inter search is performed 0..8. [0-3]\n"
" - Accepts a list of values separated by ','\n"
" for setting separate depths per GOP layer\n"
" (values can be omitted to use the first\n"
" value for the respective layer).\n"
" --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n"
" - 0, 1, 2, 3, 4: from 64x64 to 4x4\n"
" --pu-depth-intra <int>-<int> : Maximum and minimum split depths where\n"
" intra search is performed 0..8. [1-4]\n"
" - Accepts a list of values separated by ','\n"
" for setting separate depths per GOP layer\n"
" (values can be omitted to use the first\n"
@ -617,6 +626,22 @@ void print_help(void)
" --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n"
" learning trees, overrides the\n"
" --pu-depth-intra parameter. [disabled]\n"
" --mtt-depth-intra : Depth of mtt for intra slices 0..3.[0]\n"
" --mtt-depth-intra-chroma : Depth of mtt for chroma dual tree in\n"
" intra slices 0..3.[0]\n"
" --mtt-depth-inter : Depth of mtt for inter slices 0..3.[0]\n"
" All MTTs are currently experimental and\n"
" require disabling some avx2 optimizations.\n"
" --max-bt-size : maximum size for a CU resulting from\n"
" a bt split. A singular value shared for all\n"
" or a list of three values for the different\n"
" slices types (intra, inter, intra-chroma)\n"
" can be provided. [64, 64, 32]\n"
" --max-tt-size : maximum size for a CU resulting from\n"
" a tt split. A singular value shared for all\n"
" or a list of three values for the different\n"
" slices types (intra, inter, intra-chroma)\n"
" can be provided. [64, 64, 32]\n"
" --intra-rough-granularity : How many levels are used for the\n"
" logarithmic intra rough search. 0..4\n"
" With 0 all of the modes are checked \n"
@ -634,7 +659,6 @@ void print_help(void)
" This is mostly for debugging and is not\n"
" guaranteed to produce sensible bitstream or\n"
" work at all. [disabled]\n"
" --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
" --(no-)bipred : Bi-prediction [disabled]\n"
" --cu-split-termination <string> : CU split search termination [zero]\n"
" - off: Don't terminate early.\n"
@ -671,6 +695,9 @@ void print_help(void)
" --(no-)mip : Enable matrix weighted intra prediction.\n"
" --(no-)lfnst : Enable low frequency non-separable transform.\n"
" [disabled]\n"
" --(no-)isp : Enable intra sub partitions. [disabled]\n"
" Experimental, requires disabling some avx2\n"
" optimizations.\n"
" --mts <string> : Multiple Transform Selection [off].\n"
" (Currently only implemented for intra\n"
" and has effect only when rd >= 2)\n"

View file

@ -50,6 +50,21 @@ static const uint8_t INIT_QT_SPLIT_FLAG[4][6] = {
{ 0, 8, 8, 12, 12, 8, },
};
// CABAC context initialization values for the MTT vertical-split-flag models
// (cabac->ctx.mtt_vertical_model, 5 contexts). Rows 0-2 are selected by slice
// type; row 3 is passed as the second parameter to uvg_ctx_init in
// uvg_init_contexts (INIT_VERTICAL_SPLIT_FLAG[3][i]).
static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = {
{ 43, 42, 37, 42, 44, },
{ 43, 35, 37, 34, 52, },
{ 43, 42, 29, 27, 44, },
{ 9, 8, 9, 8, 5, },
};
// CABAC context initialization values for the MTT binary-split-flag models
// (cabac->ctx.mtt_binary_model, 4 contexts). Rows 0-2 are selected by slice
// type; row 3 is passed as the second parameter to uvg_ctx_init in
// uvg_init_contexts (INIT_BINARY_SPLIT_FLAG[3][i]).
static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = {
{ 28, 29, 28, 29, },
{ 43, 37, 21, 22, },
{ 36, 45, 36, 45, },
{ 12, 13, 12, 13, },
};
static const uint8_t INIT_SKIP_FLAG[4][3] = {
{ 57, 60, 46, },
{ 57, 59, 45, },
@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]);
uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]);
uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]);
uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]);
}
for (i = 0; i < 5; i++) {
uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]);
}
for (i = 0; i < 6; i++) {
@ -618,13 +638,14 @@ void uvg_context_copy(encoder_state_t * const target_state, const encoder_state_
uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,
uint32_t pos_x,
uint32_t pos_y,
int32_t width)
int32_t width,
int32_t height)
{
uint32_t uiRight = 0;
uint32_t uiLower = 0;
uint32_t position = pos_y * width + pos_x;
if (pos_x + 1 < (uint32_t)width) uiRight = sig_coeff_group_flag[position + 1];
if (pos_y + 1 < (uint32_t)width) uiLower = sig_coeff_group_flag[position + width];
if (pos_y + 1 < (uint32_t)height) uiLower = sig_coeff_group_flag[position + width];
return uiRight || uiLower;
}
@ -656,7 +677,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag,
* \returns context index for current scan position
*/
uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, int8_t type,
uint32_t width, uint32_t height, int8_t color,
int32_t* temp_diag, int32_t* temp_sum)
{
const coeff_t* data = coeff + pos_x + pos_y * width;
@ -686,7 +707,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u
}
#undef UPDATE
int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0);
if (type == 0 /* Luma */)
if (color == COLOR_Y)
{
ctx_ofs += diag < 5 ? 4 : 0;
}
@ -814,7 +835,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos
* \returns context go rice parameter
*/
uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, uint32_t baselevel)
uint32_t width, uint32_t height, uint32_t baselevel)
{
#define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/
@ -856,8 +877,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
* \returns context go rice parameter
*/
uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, uint32_t baselevel)
uint32_t width, uint32_t height, uint32_t baselevel)
{
uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel);
uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel);
return g_go_rice_pars[check];
}

View file

@ -49,10 +49,10 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice);
void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * source_state);
uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width);
uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width, int32_t height);
uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width);
uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, int8_t type,
uint32_t width, uint32_t height, int8_t type,
int32_t* temp_diag, int32_t* temp_sum);
uint32_t uvg_context_get_sig_ctx_idx_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos_y,
@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, uint32_t baselevel);
uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t height, uint32_t width, uint32_t baselevel);
uint32_t width, uint32_t height, uint32_t baselevel);
#define CNU 35
#define DWS 8

255
src/cu.c
View file

@ -34,6 +34,9 @@
#include <stdlib.h>
#include "cu.h"
#include "alf.h"
#include "encoderstate.h"
#include "threads.h"
@ -97,6 +100,42 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px)
}
/*
 * \brief Remap the coordinates of a narrow ISP sub-block so that they land on
 *        the 4x4 grid used by the CU array.
 *
 * Coordinates already aligned to a multiple of 4 in both axes are returned
 * unchanged. Otherwise the position is folded according to the ISP sub-block
 * shape (8x2 / 16x1 for horizontal splits, 2x8 / 1x16 for vertical splits).
 *
 * \param x   In/out: block x coordinate.
 * \param y   In/out: block y coordinate.
 * \param dim Sub-block long dimension (8 selects the 8x2 / 2x8 shapes).
 */
void uvg_get_isp_cu_arr_coords(int *x, int *y, int dim)
{
  const int rem_x = *x % 4;
  const int rem_y = *y % 4;

  // Already on the 4x4 grid: nothing to remap.
  if (rem_x == 0 && rem_y == 0) return;

  if (rem_y != 0) {
    // Horizontal ISP split.
    if (dim == 8 && (rem_y & 1) == 0) {
      // 8x2 sub-block.
      *y -= 2;
      *x += 4;
    } else {
      // 16x1 sub-block.
      *x += rem_y * 4;
      *y -= rem_y;
    }
  } else {
    // Vertical ISP split.
    if (dim == 8 && (*x & 1) == 0) {
      // 2x8 sub-block.
      *x -= 2;
      *y += 4;
    } else {
      // 1x16 sub-block.
      *x -= rem_x;
      *y += rem_x * 4;
    }
  }
}
const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px)
{
assert(x_px < cua->width);
@ -237,10 +276,10 @@ cu_array_t * uvg_cu_array_copy_ref(cu_array_t* cua)
* \param dst_y y-coordinate of the top edge of the copied area in dst
* \param src source lcu
*/
void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type)
void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src)
{
const int dst_stride = dst->stride >> 2;
const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C;
const int width = LCU_WIDTH;
for (int y = 0; y < width; y += SCU_WIDTH) {
for (int x = 0; x < width; x += SCU_WIDTH) {
const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y);
@ -251,3 +290,215 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
}
}
}
/*
 * \brief Initialize a cu_loc_t from luma coordinates and dimensions.
 *        Chroma dimensions are derived automatically.
 *
 * \param loc    Destination cu_loc.
 * \param x      Block top left x coordinate.
 * \param y      Block top left y coordinate.
 * \param width  Block width.
 * \param height Block height.
 */
void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height)
{
  assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions.");
  assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH).");
  // The minimum-size assert was dropped on purpose: with non-square blocks
  // and ISP enabled even 1x16 and 16x1 blocks (ISP needs at least 16 samples)
  // are valid, so a "dim >= 4" check no longer holds.
  //assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4.");

  loc->x = x;
  loc->y = y;
  // Position relative to the containing LCU.
  loc->local_x = x % LCU_WIDTH;
  loc->local_y = y % LCU_WIDTH;
  loc->width = width;
  loc->height = height;

  // TODO: when MTT is implemented, chroma dimensions can be minimum 2.
  // Chroma dimensions are half of luma in both directions.
  loc->chroma_width  = width >> 1;
  loc->chroma_height = height >> 1;
}
/*
 * \brief Compute the child block locations produced by applying a split.
 *
 * \param origin           Parent block location and dimensions.
 * \param split            Split type to apply (must not be NO_SPLIT).
 * \param out              Receives up to four child locations.
 * \param separate_chroma  Optional out-flag, set to 1 when a resulting child
 *                         is small enough to need a separate chroma tree.
 *                         Never cleared by this function.
 *
 * \return Number of child blocks written to out (4 for QT, 2 for BT, 3 for TT).
 */
int uvg_get_split_locs(
  const cu_loc_t* const origin,
  enum split_type split,
  cu_loc_t out[4],
  uint8_t* separate_chroma)
{
  const int x  = origin->x;
  const int y  = origin->y;
  const int w  = origin->width;
  const int h  = origin->height;
  const int hw = w >> 1;   // half width
  const int hh = h >> 1;   // half height
  const int qw = w >> 2;   // quarter width
  const int qh = h >> 2;   // quarter height

  if (w == 4 && separate_chroma) *separate_chroma = 1;

  switch (split) {
    case NO_SPLIT:
      assert(0 && "trying to get split from no split");
      break;

    case QT_SPLIT:
      uvg_cu_loc_ctor(&out[0], x,      y,      hw, hh);
      uvg_cu_loc_ctor(&out[1], x + hw, y,      hw, hh);
      uvg_cu_loc_ctor(&out[2], x,      y + hh, hw, hh);
      uvg_cu_loc_ctor(&out[3], x + hw, y + hh, hw, hh);
      if (hh == 4 && separate_chroma) *separate_chroma = 1;
      return 4;

    case BT_HOR_SPLIT:
      uvg_cu_loc_ctor(&out[0], x, y,      w, hh);
      uvg_cu_loc_ctor(&out[1], x, y + hh, w, hh);
      if (hh * w < 64 && separate_chroma) *separate_chroma = 1;
      return 2;

    case BT_VER_SPLIT:
      uvg_cu_loc_ctor(&out[0], x,      y, hw, h);
      uvg_cu_loc_ctor(&out[1], x + hw, y, hw, h);
      if ((hw == 4 || hw * h < 64) && separate_chroma) *separate_chroma = 1;
      return 2;

    case TT_HOR_SPLIT:
      // Ternary split: quarter / half / quarter from the top.
      uvg_cu_loc_ctor(&out[0], x, y,           w, qh);
      uvg_cu_loc_ctor(&out[1], x, y + qh,      w, hh);
      uvg_cu_loc_ctor(&out[2], x, y + qh + hh, w, qh);
      if (qh * w < 64 && separate_chroma) *separate_chroma = 1;
      return 3;

    case TT_VER_SPLIT:
      // Ternary split: quarter / half / quarter from the left.
      uvg_cu_loc_ctor(&out[0], x,           y, qw, h);
      uvg_cu_loc_ctor(&out[1], x + qw,      y, hw, h);
      uvg_cu_loc_ctor(&out[2], x + qw + hw, y, qw, h);
      if ((qw == 4 || qw * h < 64) && separate_chroma) *separate_chroma = 1;
      return 3;
  }
  return 0;
}
/*
 * \brief Determine the split forced on a block that crosses the frame border.
 *
 * \param state         Encoder state; frame dimensions are read from it.
 * \param cu_loc        Block location and dimensions.
 * \param max_mtt_depth Remaining MTT depth; when 0 only QT may be forced.
 *
 * \return NO_SPLIT when the block fits entirely inside the frame, otherwise
 *         a BT split when MTT depth remains, or QT_SPLIT as the fallback.
 */
int uvg_get_implicit_split(
  const encoder_state_t* const state,
  const cu_loc_t* const cu_loc,
  uint8_t max_mtt_depth)
{
  const bool fits_right  = cu_loc->x + cu_loc->width  <= state->tile->frame->width;
  const bool fits_bottom = cu_loc->y + cu_loc->height <= state->tile->frame->height;

  if (fits_right && fits_bottom) return NO_SPLIT;
  if (fits_right  && max_mtt_depth != 0) return BT_HOR_SPLIT;
  if (fits_bottom && max_mtt_depth != 0) return BT_VER_SPLIT;
  return QT_SPLIT;
}
/*
 * \brief Determine which split types are allowed for the given block.
 *
 * Fills splits[] (indexed by enum split_type) with an allowed/forbidden flag
 * for each of the six split types, applying the configured QT/BT/TT size and
 * depth limits plus the extra restrictions of the chroma dual tree.
 *
 * \param state      Encoder state; cfg limits and frame geometry are read.
 * \param cu_loc     Location and dimensions of the block being considered.
 * \param split_tree Record of the splits taken to reach this block.
 * \param tree_type  Tree being coded; UVG_CHROMA_T adds chroma-only limits.
 * \param splits     Output: one bool per enum split_type value.
 *
 * \return 1 when a split is implicitly forced (block crosses the frame
 *         border), 0 otherwise.
 */
int uvg_get_possible_splits(const encoder_state_t * const state,
  const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6])
{
  const unsigned width = cu_loc->width;
  const unsigned height = cu_loc->height;
  // Index into the per-slice-type cfg arrays: 0 = intra luma, 1 = inter,
  // 2 = intra chroma (dual tree).
  const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1;

  // Implicit MTT splits at the frame border extend the configured depth
  // budget instead of consuming it.
  const unsigned max_btd =
    state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth;
  const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type];
  const unsigned min_bt_size = 1 << MIN_SIZE;
  const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type];
  const unsigned min_tt_size = 1 << MIN_SIZE;
  const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type];

  const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd);

  // Start with everything allowed and prune below.
  splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true;
  bool can_btt = split_tree.mtt_depth < max_btd;

  const enum split_type last_split = GET_SPLITDATA(&split_tree, split_tree.current_depth - 1);
  // The BT split parallel to the previous TT split's direction.
  const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT;

  // don't allow QT-splitting below a BT split
  if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false;
  if (width <= min_qt_size) splits[QT_SPLIT] = false;

  if (tree_type == UVG_CHROMA_T && width <= 8) splits[QT_SPLIT] = false;

  if (implicitSplit != NO_SPLIT)
  {
    // Block crosses the frame border: NO_SPLIT and TT are forbidden and the
    // implicit BT direction is forced (QT as the fallback if BT is too big).
    splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false;

    splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size;
    splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size;
    if (tree_type == UVG_CHROMA_T && width <= 8) splits[BT_VER_SPLIT] = false;
    if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true;
    return 1;
  }

  // The center (part_index 1) partition of a TT split must not take a BT
  // split parallel to that TT direction (mirrors VVC's redundant-split rule).
  if ((last_split == TT_HOR_SPLIT || last_split == TT_VER_SPLIT) && split_tree.part_index == 1)
  {
    splits[BT_HOR_SPLIT] = parl_split != BT_HOR_SPLIT;
    splits[BT_VER_SPLIT] = parl_split != BT_VER_SPLIT;
  }

  // Disable all MTT splits when the block is already at minimum BT and TT
  // size in both dimensions...
  if (can_btt && (width <= min_bt_size && height <= min_bt_size)
    && ((width <= min_tt_size && height <= min_tt_size)))
  {
    can_btt = false;
  }
  // ...or exceeds both the BT and TT maximum sizes.
  if (can_btt && (width > max_bt_size || height > max_bt_size)
    && ((width > max_tt_size || height > max_tt_size)))
  {
    can_btt = false;
  }

  if (!can_btt)
  {
    splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false;
    return 0;
  }

  if (width > max_bt_size || height > max_bt_size)
  {
    splits[BT_HOR_SPLIT] = splits[BT_VER_SPLIT] = false;
  }

  // specific check for BT splits
  if (height <= min_bt_size) splits[BT_HOR_SPLIT] = false;
  // NOTE(review): the 64-sample constraints below presumably implement VVC's
  // pipeline (VPDU) restrictions -- confirm against the spec.
  if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false;
  if (tree_type == UVG_CHROMA_T && width * height <= 64) splits[BT_HOR_SPLIT] = false;

  if (width <= min_bt_size) splits[BT_VER_SPLIT] = false;
  if (width <= 64 && height > 64) splits[BT_VER_SPLIT] = false;
  if (tree_type == UVG_CHROMA_T && (width * height <= 64 || width <= 8)) splits[BT_VER_SPLIT] = false;

  //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false;

  // TT splits need room for the two quarter-size partitions (dim > 2 * min)
  // and must respect the configured TT maximum.
  if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size)
    splits[TT_HOR_SPLIT] = false;
  if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false;
  if (tree_type == UVG_CHROMA_T && width * height <= 64 * 2) splits[TT_HOR_SPLIT] = false;

  if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size)
    splits[TT_VER_SPLIT] = false;
  if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false;
  if (tree_type == UVG_CHROMA_T && (width * height <= 64 * 2 || width <= 16)) splits[TT_VER_SPLIT] = false;

  //if (modeType == MODE_TYPE_INTER && width * height == 64) splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false;
  return 0;
}
/*
 * \brief Count the available reference units along the left or top edge of a
 *        block inside the current LCU.
 *
 * \param cu_loc Block whose neighboring edge is examined.
 * \param lcu    Local CU data of the current LCU.
 * \param left   True to count along the left edge, false for the top edge.
 *
 * \return 0 at the frame edge, otherwise the number of available units.
 */
int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left)
{
  // No neighbors beyond the frame origin.
  if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) {
    return 0;
  }

  // Neighbor column/row lies in the previous LCU and is fully coded.
  // NOTE(review): the two branches use different scales ((LCU - y) / 4 vs.
  // width / 2) -- presumably intentional; confirm against the callers.
  if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4;
  if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2;

  // Start from the block's own extent, rounded down to a multiple of 4.
  int amount = left ? cu_loc->height & ~3 : cu_loc->width & ~3;
  if(left) {
    const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y);
    // Special case: right half of a 64x64 CU at the top edge of the LCU.
    if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8;
    // Extend downward while already-coded CUs exist just left of the block.
    while (cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) {
      amount += TR_MIN_WIDTH;
    }
    return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH);
  }
  // Extend rightward while already-coded CUs exist just above the block.
  while (cu_loc->local_x + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) {
    amount += TR_MIN_WIDTH;
  }
  return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH);
}

167
src/cu.h
View file

@ -77,55 +77,6 @@ typedef enum {
MTS_TR_NUM = 6,
} mts_idx;
extern const uint8_t uvg_part_mode_num_parts[];
extern const uint8_t uvg_part_mode_offsets[][4][2];
extern const uint8_t uvg_part_mode_sizes[][4][2];
/**
* \brief Get the x coordinate of a PU.
*
* \param part_mode partition mode of the containing CU
* \param cu_width width of the containing CU
* \param cu_x x coordinate of the containing CU
* \param i number of the PU
* \return location of the left edge of the PU
*/
#define PU_GET_X(part_mode, cu_width, cu_x, i) \
((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4)
/**
* \brief Get the y coordinate of a PU.
*
* \param part_mode partition mode of the containing CU
* \param cu_width width of the containing CU
* \param cu_y y coordinate of the containing CU
* \param i number of the PU
* \return location of the top edge of the PU
*/
#define PU_GET_Y(part_mode, cu_width, cu_y, i) \
((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4)
/**
* \brief Get the width of a PU.
*
* \param part_mode partition mode of the containing CU
* \param cu_width width of the containing CU
* \param i number of the PU
* \return width of the PU
*/
#define PU_GET_W(part_mode, cu_width, i) \
(uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4)
/**
* \brief Get the height of a PU.
*
* \param part_mode partition mode of the containing CU
* \param cu_width width of the containing CU
* \param i number of the PU
* \return height of the PU
*/
#define PU_GET_H(part_mode, cu_width, i) \
(uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4)
//////////////////////////////////////////////////////////////////////////
// TYPES
@ -142,24 +93,53 @@ enum uvg_tree_type {
UVG_CHROMA_T = 2
};
// Block split types of the VVC multi-type tree (quad, binary and ternary).
enum split_type {
  NO_SPLIT = 0,     //!< \brief Leaf: the block is not split further.
  QT_SPLIT = 1,     //!< \brief Quad-tree split into four equal quadrants.
  BT_HOR_SPLIT = 2, //!< \brief Binary split into two halves by a horizontal edge.
  BT_VER_SPLIT = 3, //!< \brief Binary split into two halves by a vertical edge.
  TT_HOR_SPLIT = 4, //!< \brief Ternary 1/4-1/2-1/4 split, horizontal edges.
  TT_VER_SPLIT = 5, //!< \brief Ternary 1/4-1/2-1/4 split, vertical edges.
};
// State of the recursive partitioning chain that leads to a block.
typedef struct {
  uint32_t split_tree;        //!< \brief Packed record of the splits taken so far, 3 bits per depth level.
  uint8_t current_depth;      //!< \brief Total number of splits from the CTU root.
  uint8_t mtt_depth;          //!< \brief Number of BT/TT (multi-type tree) splits in the chain.
  uint8_t implicit_mtt_depth; //!< \brief MTT splits forced by the frame border; added to the configured depth limit in uvg_get_possible_splits.
  uint8_t part_index;         //!< \brief Index of this block among its parent split's children.
} split_tree_t;
// Split for each depth takes three bits like xxy where if either x bit is set
// it is a MTT split, and if there are any MTT split QT split is not allowed.
// The mask 0x6DB6DB6 is binary 110 repeated nine times (one "xx0" group per
// depth level of the 3*9-bit split_tree field), so the macro is true only
// when every recorded split is NO_SPLIT (000) or QT_SPLIT (001).
// Fix: the literal was written as 6DB6DB6 without the 0x prefix, which is
// not a valid C integer constant and fails to compile.
#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0)
/**
* \brief Struct for CU info
*/
typedef struct
{
uint8_t type : 3; //!< \brief block type, one of cu_type_t values
uint8_t depth : 3; //!< \brief depth / size of this block
uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
uint8_t tr_depth : 3; //!< \brief transform depth
uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped
uint8_t merged : 1; //!< \brief flag to indicate this block is merged
uint8_t merge_idx : 3; //!< \brief merge index
uint8_t tr_skip : 3; //!< \brief transform skip flag
uint8_t tr_idx : 3; //!< \brief transform index
uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding
uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding
uint8_t log2_width : 3;
uint8_t log2_height : 3;
uint8_t log2_chroma_width : 3;
uint8_t log2_chroma_height : 3;
uint16_t cbf;
uint8_t root_cbf;
uint32_t split_tree : 3 * 9;
/**
* \brief QP used for the CU.
*
@ -172,12 +152,15 @@ typedef struct
uint8_t violates_mts_coeff_constraint : 1;
uint8_t mts_last_scan_pos : 1;
uint8_t violates_lfnst_constrained_luma : 1; // Two types, luma and chroma. Luma index is 0.
uint8_t violates_lfnst_constrained_chroma : 1; // Two types, luma and chroma. Luma index is 0.
uint8_t violates_lfnst_constrained_luma : 1;
uint8_t violates_lfnst_constrained_chroma : 1;
uint8_t lfnst_last_scan_pos : 1;
uint8_t lfnst_idx : 2;
uint8_t cr_lfnst_idx : 2;
uint8_t luma_deblocking : 2;
uint8_t chroma_deblocking : 2;
union {
struct {
int8_t mode;
@ -185,6 +168,9 @@ typedef struct
uint8_t multi_ref_idx;
int8_t mip_flag;
int8_t mip_is_transposed;
int8_t isp_mode;
uint8_t isp_cbfs : 4;
uint8_t isp_index : 2;
} intra;
struct {
mv_t mv[2][2]; // \brief Motion vectors for L0 and L1
@ -200,12 +186,25 @@ typedef struct
typedef struct {
int16_t x;
int16_t y;
uint8_t local_x;
uint8_t local_y;
int8_t width;
int8_t height;
int8_t chroma_width;
int8_t chroma_height;
} cu_loc_t;
void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height);
typedef struct encoder_state_t encoder_state_t;
int uvg_get_split_locs(
const cu_loc_t* const origin,
enum split_type split,
cu_loc_t out[4],
uint8_t* separate_chroma);
int uvg_get_possible_splits(const encoder_state_t* const state,
const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]);
#define CU_GET_MV_CAND(cu_info_ptr, reflist) \
(((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)
@ -219,7 +218,7 @@ typedef struct {
} \
} while (0)
#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \
#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d part_size=%d coded=%d " \
"skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \
"intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \
"intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \
@ -227,7 +226,7 @@ typedef struct {
"intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \
"inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \
"inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \
, (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \
, (cu).type, (cu).part_size, (cu).coded, \
(cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \
(cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \
(cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \
@ -246,6 +245,7 @@ typedef struct cu_array_t {
} cu_array_t;
cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px);
void uvg_get_isp_cu_arr_coords(int* x, int* y, int dim);
const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px);
cu_array_t * uvg_cu_array_alloc(const int width, const int height);
@ -382,8 +382,9 @@ typedef struct {
cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1];
} lcu_t;
void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type
tree_type);
void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src);
int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left);
/**
* \brief Return pointer to the top right reference CU.
@ -412,9 +413,11 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu
*/
static INLINE void copy_coeffs(const coeff_t *__restrict src,
coeff_t *__restrict dest,
size_t width)
size_t width, size_t height, const int lcu_width)
{
memcpy(dest, src, width * width * sizeof(coeff_t));
for (int j = 0; j < height; ++j) {
memcpy(dest + j * lcu_width, src + j * lcu_width, width * sizeof(coeff_t));
}
}
@ -554,56 +557,52 @@ static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y)
} while(0)
#define NUM_CBF_DEPTHS 5
static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 };
/**
* Check if CBF in a given level >= depth is true.
*/
static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane)
static INLINE int cbf_is_set(uint16_t cbf, color_t plane)
{
return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0;
return (cbf & (1 << (plane))) != 0;
}
/**
* Check if CBF in a given level >= depth is true.
*/
static INLINE int cbf_is_set_any(uint16_t cbf, int depth)
static INLINE int cbf_is_set_any(uint16_t cbf)
{
return cbf_is_set(cbf, depth, COLOR_Y) ||
cbf_is_set(cbf, depth, COLOR_U) ||
cbf_is_set(cbf, depth, COLOR_V);
return cbf_is_set(cbf, COLOR_Y) ||
cbf_is_set(cbf, COLOR_U) ||
cbf_is_set(cbf, COLOR_V);
}
/**
* Set CBF in a level to true.
*/
static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane)
static INLINE void cbf_set(uint16_t *cbf, color_t plane)
{
// Return value of the bit corresponding to the level.
*cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane);
*cbf |= (1) << (plane);
}
/**
* Set CBF in a level to true if it is set at a lower level in any of
* the child_cbfs.
*/
static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane)
static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], color_t plane)
{
bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) ||
cbf_is_set(child_cbfs[1], depth + 1, plane) ||
cbf_is_set(child_cbfs[2], depth + 1, plane);
bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) ||
cbf_is_set(child_cbfs[1], plane) ||
cbf_is_set(child_cbfs[2], plane);
if (child_cbf_set) {
cbf_set(cbf, depth, plane);
cbf_set(cbf, plane);
}
}
/**
* Set CBF in a levels <= depth to false.
*/
static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane)
static INLINE void cbf_clear(uint16_t *cbf, color_t plane)
{
*cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane));
*cbf &= ~(1 << (plane));
}
/**
@ -611,11 +610,11 @@ static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane)
*/
static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane)
{
cbf_clear(cbf, 0, plane);
*cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane));
cbf_clear(cbf, plane);
*cbf |= src & (1 << plane);
}
#define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth)
#define SET_SPLITDATA(CU,flag) { (CU)->split=(flag); }
#define GET_SPLITDATA(CU,curDepth) ((CU)->split_tree >> ((MAX((curDepth), 0) * 3)) & 7)
#define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE)
#endif

1139
src/dep_quant.c Normal file

File diff suppressed because it is too large Load diff

247
src/dep_quant.h Normal file
View file

@ -0,0 +1,247 @@
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
****************************************************************************/
#ifndef DEP_QUANT_H_
#define DEP_QUANT_H_
#include "cu.h"
#include "global.h"
#define SM_NUM_CTX_SETS_SIG 3
#define SM_NUM_CTX_SETS_GTX 2
#define SM_MAX_NUM_SIG_SBB_CTX 2
#define SM_MAX_NUM_SIG_CTX 12
#define SM_MAX_NUM_GTX_CTX 21
#define SCALE_BITS 15
#define RICEMAX 32
typedef struct encoder_control_t encoder_control_t;
enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
struct dep_quant_scan_info
{
uint8_t sig_ctx_offset[2];
uint8_t gtx_ctx_offset[2];
uint16_t cg_pos;
uint16_t pos_y;
uint16_t pos_x;
uint8_t next_sbb_right;
uint8_t next_sbb_below;
};
typedef struct
{
int m_QShift;
int64_t m_QAdd;
int64_t m_QScale;
int64_t m_maxQIdx;
int64_t m_thresLast;
int64_t m_thresSSbb;
// distortion normalization
int m_DistShift;
int64_t m_DistAdd;
int64_t m_DistStepAdd;
int64_t m_DistOrgFact;
bool needs_init;
} quant_block;
typedef struct
{
int32_t m_lastBitsX[TR_MAX_WIDTH];
int32_t m_lastBitsY[TR_MAX_WIDTH];
uint32_t m_sigSbbFracBits[SM_MAX_NUM_SIG_SBB_CTX][2];
uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2];
int32_t m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6];
bool needs_init;
} rate_estimator_t;
typedef struct
{
uint8_t num;
uint8_t inPos[5];
} NbInfoSbb;
typedef struct
{
uint16_t maxDist;
uint16_t num;
uint16_t outPos[5];
} NbInfoOut;
typedef struct {
int32_t absLevel[4];
int64_t deltaDist[4];
} PQData;
typedef struct {
int64_t ALIGNED(32) rdCost[8];
int32_t ALIGNED(32) absLevel[8];
int32_t ALIGNED(32) prevId[8];
} Decision;
typedef struct {
uint8_t* sbbFlags;
uint8_t* levels;
} SbbCtx;
typedef struct {
const NbInfoOut* m_nbInfo;
uint32_t m_sbbFlagBits[2][2];
SbbCtx m_allSbbCtx[2];
int m_curr_sbb_ctx_offset;
int m_prev_sbb_ctx_offset;
uint8_t sbb_memory[8 * 1024];
uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH];
int num_coeff;
} common_context;
typedef struct {
int64_t m_rdCost;
uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id
int8_t m_numSigSbb;
int m_remRegBins;
int8_t m_refSbbCtxId;
uint32_t m_sbbFracBits[2];
uint32_t m_sigFracBits[2];
int32_t m_coeffFracBits[6];
int8_t m_goRicePar;
int8_t m_goRiceZero;
int8_t m_stateId;
uint32_t* m_sigFracBitsArray[12];
int32_t* m_gtxFracBitsArray[21];
common_context* m_commonCtx;
unsigned effWidth;
unsigned effHeight;
} depquant_state;
typedef struct {
int64_t ALIGNED(32) m_rdCost[12];
uint8_t ALIGNED(32) m_absLevels[3][16 * 4];
uint16_t ALIGNED(32) m_ctxInit[3][16 * 4];
int8_t ALIGNED(16) m_numSigSbb[12];
int ALIGNED(32) m_remRegBins[12];
int8_t ALIGNED(16) m_refSbbCtxId[12];
uint32_t ALIGNED(32) m_sbbFracBits[12][2];
uint32_t ALIGNED(32) m_sigFracBits[12][2];
int32_t ALIGNED(32) m_coeffFracBits[12][6];
int8_t ALIGNED(16) m_goRicePar[12];
int8_t ALIGNED(16) m_goRiceZero[12];
int8_t ALIGNED(16) m_stateId[12];
uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2];
int32_t ALIGNED(32) m_gtxFracBitsArray[21][6];
common_context* m_commonCtx;
unsigned effWidth;
unsigned effHeight;
bool all_gte_four;
bool all_lt_four;
} all_depquant_states;
typedef struct {
common_context m_common_context;
all_depquant_states m_allStates;
int m_curr_state_offset;
int m_prev_state_offset;
int m_skip_state_offset;
depquant_state m_startState;
quant_block* m_quant;
Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH];
} context_store;
int uvg_init_nb_info(encoder_control_t* encoder);
void uvg_dealloc_nb_info(encoder_control_t* encoder);
void uvg_dep_quant_dequant(
const encoder_state_t* const state,
const int block_type,
const int width,
const int height,
const color_t compID,
coeff_t* quant_coeff,
coeff_t* coeff,
bool enableScalingLists);
int uvg_dep_quant(
const encoder_state_t* const state,
const cu_info_t* const cur_tu,
const int width,
const int height,
const coeff_t* srcCoeff,
coeff_t* coeff_out,
const color_t compID,
enum uvg_tree_type tree_type,
int* absSum,
const bool enableScalingLists);
void uvg_dep_quant_update_state(
context_store* ctxs,
int numIPos,
const uint32_t scan_pos,
const Decision* decisions,
const uint32_t sigCtxOffsetNext,
const uint32_t gtxCtxOffsetNext,
const NbInfoSbb next_nb_info_ssb,
const int baseLevel,
const bool extRiceFlag,
int decision_id);
void uvg_dep_quant_update_state_eos(
context_store* ctxs,
const uint32_t scan_pos,
const uint32_t cg_pos,
const uint32_t sigCtxOffsetNext,
const uint32_t gtxCtxOffsetNext,
const uint32_t width_in_sbb,
const uint32_t height_in_sbb,
const uint32_t next_sbb_right,
const uint32_t next_sbb_below,
const Decision* decisions,
int decision_id);
void uvg_dep_quant_check_rd_costs(
const all_depquant_states* const state,
const enum ScanPosType spt,
const PQData* pqDataA,
Decision* decisions,
const int decisionA,
const int decisionB,
const int state_offset);
#endif

File diff suppressed because it is too large Load diff

View file

@ -40,30 +40,29 @@
#include "encoderstate.h"
#include "global.h"
bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu);
bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t*
const cu_loc);
bool uvg_is_lfnst_allowed(
const encoder_state_t* const state,
const cu_info_t* const pred_cu,
const int width,
const int height,
const int x,
const int y,
enum uvg_tree_type tree_type,
const color_t color,
const lcu_t* lcu);
const cu_loc_t* const cu_loc, const lcu_t* const lcu);
void uvg_encode_coding_tree(
encoder_state_t * const state,
uint16_t x_ctb,
uint16_t y_ctb,
uint8_t depth,
lcu_coeff_t *coeff,
enum uvg_tree_type tree_type);
enum uvg_tree_type tree_type,
const cu_loc_t* const cu_loc,
const cu_loc_t* const chroma_loc,
split_tree_t split_tree,
bool has_chroma);
void uvg_encode_ts_residual(encoder_state_t* const state,
cabac_data_t* const cabac,
const coeff_t* coeff,
uint32_t width,
uint32_t height,
uint8_t type,
int8_t scan_mode,
double* bits);
@ -77,41 +76,47 @@ void uvg_encode_mvd(encoder_state_t * const state,
double uvg_mock_encode_coding_unit(
encoder_state_t* const state,
cabac_data_t* cabac,
int x,
int y,
int depth,
const cu_loc_t* const cu_loc,
const cu_loc_t* const chroma_loc,
lcu_t* lcu,
cu_info_t* cur_cu,
enum uvg_tree_type tree_type);
enum uvg_tree_type tree_type,
const split_tree_t split_tree);
int uvg_encode_inter_prediction_unit(encoder_state_t* const state,
int uvg_encode_inter_prediction_unit(
encoder_state_t* const state,
cabac_data_t* const cabac,
const cu_info_t* const cur_cu,
int x, int y, int width, int height,
int depth,
lcu_t* lcu,
double* bits_out,
const cu_loc_t* const cu_loc);
void uvg_encode_intra_luma_coding_unit(
const encoder_state_t* const state,
cabac_data_t* const cabac,
const cu_info_t* const cur_cu,
const cu_loc_t* const cu_loc,
const lcu_t* lcu,
double* bits_out);
void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state,
cabac_data_t* const cabac,
const cu_info_t* const cur_cu,
int x, int y, int depth, const lcu_t* lcu, double* bits_out);
bool uvg_write_split_flag(
uint8_t uvg_write_split_flag(
const encoder_state_t* const state,
cabac_data_t* cabac,
const cu_info_t* left_cu,
const cu_info_t* above_cu,
uint8_t split_flag,
int depth,
int cu_width,
int x,
int y,
const cu_loc_t* const cu_loc,
split_tree_t,
enum uvg_tree_type tree_type,
bool* is_implicit_out,
double* bits_out);
void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
uint8_t lastpos_x, uint8_t lastpos_y,
uint8_t width, uint8_t height,
uint8_t type, uint8_t scan, double* bits_out);
void uvg_get_sub_coeff(const coeff_t* dst, const coeff_t* const src,
const int lcu_x, const int lcu_y,
const int block_w, const int block_h,
const int lcu_width);

View file

@ -320,6 +320,13 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
encoder->scaling_list.use_default_list = 1;
}
if(cfg->dep_quant) {
if(!uvg_init_nb_info(encoder)) {
fprintf(stderr, "Could not initialize nb info.\n");
goto init_failed;
}
}
// ROI / delta QP
if (cfg->roi.file_path) {
const char *mode[2] = { "r", "rb" };
@ -379,10 +386,6 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
goto init_failed;
}
// NOTE: When tr_depth_inter is equal to 0, the transform is still split
// for SMP and AMP partition units.
encoder->tr_depth_inter = 0;
//Tiles
encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
encoder->cfg.tiles_height_count > 1;

View file

@ -38,6 +38,7 @@
* Initialization of encoder_control_t.
*/
#include "dep_quant.h"
#include "global.h" // IWYU pragma: keep
#include "uvg266.h"
#include "scalinglist.h"
@ -98,6 +99,10 @@ typedef struct encoder_control_t
//scaling list
scaling_list_t scaling_list;
NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1];
NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1];
struct dep_quant_scan_info* scan_info[7 + 1][7 + 1];
//spec: references to variables defined in Rec. ITU-T H.265 (04/2013)
int8_t tiles_enable; /*!<spec: tiles_enabled */
@ -132,8 +137,6 @@ typedef struct encoder_control_t
FILE *roi_file;
int tr_depth_inter;
//! pic_parameter_set
struct {
uint8_t dependent_slice_segments_enabled_flag;

View file

@ -528,48 +528,31 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
WRITE_UE(stream, MIN_SIZE-2, "log2_min_luma_coding_block_size_minus2"); // Min size 2^3 = 8x8
// if(!no_partition_constraints_override_constraint_flag)
WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag");
WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_luma");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma");
if (encoder->cfg.max_btt_depth[0]) {
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
}
if (encoder->chroma_format != UVG_CSP_400)
{
WRITE_U(stream, encoder->cfg.dual_tree, 1, "qtbtt_dual_tree_intra_flag");
}
if (encoder->cfg.dual_tree) {
WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
if (0 /*sps_max_mtt_hierarchy_depth_intra_slice_chroma != 0*/) {
WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
if (encoder->cfg.max_btt_depth[2]) {
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
}
}
WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_inter_slice");
WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_inter_slice");
#if 0 // mtt depth intra
if (max_mtt_depth_intra != 0) {
WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_luma");
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_luma");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice");
WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice");
if (encoder->cfg.max_btt_depth[1] != 0) {
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
}
#endif
#if 0 // mtt depth inter
if (max_mtt_depth_inter != 0) {
WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_inter_tile_group");
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_inter_tile_group");
}
#endif
#if 0 // Dual Tree
if (encoder->cfg.dual_i_tree) {
WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_tile_group_chroma");
WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_tile_group_chroma");
if (max_mtt_depth_intra != 0) {
WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_chroma");
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_chroma");
}
}
#endif
if (LCU_WIDTH > 32)
WRITE_U(stream, (TR_MAX_LOG2_SIZE - 5) ? 1 : 0, 1, "sps_max_luma_transform_size_64_flag");
@ -665,7 +648,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
WRITE_UE(stream, encoder->cfg.log2_parallel_merge_level-2, "log2_parallel_merge_level_minus2");
WRITE_U(stream, 0, 1, "sps_isp_enabled_flag");
WRITE_U(stream, encoder->cfg.isp, 1, "sps_isp_enabled_flag");
if (state->encoder_control->cfg.mrl) {
WRITE_U(stream, 1, 1, "sps_mrl_enabled_flag");
@ -706,7 +689,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
WRITE_U(stream, 0, 1, "scaling_list_enabled_flag");
WRITE_U(stream, 0, 1, "pic_dep_quant_enabled_flag");
WRITE_U(stream, encoder->cfg.dep_quant, 1, "pic_dep_quant_enabled_flag");
WRITE_U(stream, encoder->cfg.signhide_enable, 1, "pic_sign_data_hiding_enabled_flag");
@ -1142,7 +1125,7 @@ static void uvg_encoder_state_write_bitstream_picture_header(
WRITE_U(stream, 0, 1, "ph_mvd_l1_zero_flag");
}
if (encoder->cfg.jccr) {
if (encoder->cfg.jccr && encoder->chroma_format != UVG_CSP_400) {
WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag");
}
// END PICTURE HEADER
@ -1375,11 +1358,14 @@ void uvg_encoder_state_write_bitstream_slice_header(
}
// ToDo: depquant
if (encoder->cfg.dep_quant) {
WRITE_U(stream, 1, 1, "sh_dep_quant_used_flag");
}
if (state->encoder_control->cfg.signhide_enable) {
if (state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) {
WRITE_U(stream, 1, 1, "sh_sign_data_hiding_used_flag");
}
if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable /* && !cfg.dep_quant*/)
if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant)
{
// TODO: find out what this is actually about and parametrize it
WRITE_U(stream, 0, 1, "sh_ts_residual_coding_disabled_flag");

View file

@ -627,43 +627,52 @@ static void encode_sao(encoder_state_t * const state,
* \param prev_qp -1 if QP delta has not been coded in current QG,
* otherwise the QP of the current QG
*/
static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp)
static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const
int depth)
{
// Stop recursion if the CU is completely outside the frame.
if (x >= state->tile->frame->width || y >= state->tile->frame->height) return;
if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return;
cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
const int cu_width = LCU_WIDTH >> depth;
cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y);
const int width = 1 << cu->log2_width;
if (depth <= state->frame->max_qp_delta_depth) {
*prev_qp = -1;
}
if (cu->depth > depth) {
if (cu_loc->width > width) {
// Recursively process sub-CUs.
const int d = cu_width >> 1;
set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp);
set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp);
const int half_width = cu_loc->width >> 1;
const int half_height = cu_loc->height >> 1;
cu_loc_t split_cu_loc;
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);
} else {
bool cbf_found = *prev_qp >= 0;
if (cu->tr_depth > depth) {
int y_limit = cu_loc->y + cu_loc->height;
int x_limit = cu_loc->x + cu_loc->width;
if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
// The CU is split into smaller transform units. Check whether coded
// block flag is set for any of the TUs.
const int tu_width = LCU_WIDTH >> cu->tr_depth;
for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) {
for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) {
const int tu_width = MIN(TR_MAX_WIDTH, 1 << cu->log2_width);
for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) {
for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) {
cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
if (cbf_is_set_any(tu->cbf, cu->depth)) {
if (cbf_is_set_any(tu->cbf)) {
cbf_found = true;
}
}
}
} else if (cbf_is_set_any(cu->cbf, cu->depth)) {
} else if (cbf_is_set_any(cu->cbf)) {
cbf_found = true;
}
@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
if (cbf_found) {
*prev_qp = qp = cu->qp;
} else {
qp = uvg_get_cu_ref_qp(state, x, y, *last_qp);
qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp);
}
// Set the correct QP for all state->tile->frame->cu_array elements in
// the area covered by the CU.
for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) {
for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) {
uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
}
}
if (is_last_cu_in_qg(state, x, y, depth)) {
if (is_last_cu_in_qg(state, cu_loc)) {
*last_qp = cu->qp;
}
}
@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
if (state->frame->max_qp_delta_depth >= 0) {
int last_qp = state->last_qp;
int prev_qp = -1;
set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
cu_loc_t cu_loc;
uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH);
set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0);
}
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) {
@ -870,10 +881,16 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
enum uvg_tree_type tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T;
//Encode coding tree
uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff, tree_type);
cu_loc_t start;
uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
split_tree_t split_tree = { 0, 0, 0, 0, 0 };
uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true);
if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) {
uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, 0, lcu->coeff, UVG_CHROMA_T);
uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
cu_loc_t chroma_tree_loc = start;
uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true);
}
if (!state->cabac.only_count) {
@ -1152,6 +1169,12 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
uvg_threadqueue_submit(state->encoder_control->threadqueue, job[0]);
uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]);
#ifdef UVG_DEBUG_PRINT_CABAC
// Ensures that the ctus are encoded in raster scan order
if(i >= state->tile->frame->width_in_lcu) {
uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[(lcu->id / state->tile->frame->width_in_lcu - 1) * state->tile->frame->width_in_lcu]);
}
#endif
}
uvg_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
@ -1281,13 +1304,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
sub_state->tile->frame->width_in_lcu * LCU_WIDTH,
sub_state->tile->frame->height_in_lcu * LCU_WIDTH
);
if(main_state->encoder_control->cfg.dual_tree){
if(main_state->encoder_control->cfg.dual_tree && main_state->frame->is_irap){
sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray(
main_state->tile->frame->chroma_cu_array,
offset_x / 2,
offset_y / 2,
sub_state->tile->frame->width_in_lcu * LCU_WIDTH_C,
sub_state->tile->frame->height_in_lcu * LCU_WIDTH_C
offset_x,
offset_y,
sub_state->tile->frame->width_in_lcu * LCU_WIDTH,
sub_state->tile->frame->height_in_lcu * LCU_WIDTH
);
}
}
@ -1926,10 +1949,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict
if (cfg->dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 && state->frame->is_irap) {
assert(state->tile->frame->chroma_cu_array == NULL);
state->tile->frame->chroma_cu_array = uvg_cu_array_chroma_alloc(
state->tile->frame->width / 2,
state->tile->frame->height / 2,
state->encoder_control->chroma_format
state->tile->frame->chroma_cu_array = uvg_cu_array_alloc(
state->tile->frame->width,
state->tile->frame->height
);
}
// Set pictype.
@ -2029,9 +2051,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s
void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame)
{
#if UVG_DEBUG_PRINT_CABAC == 1
uvg_cabac_bins_count = 0;
// uvg_cabac_bins_count = 0;
if (state->frame->num == 0) uvg_cabac_bins_verbose = true;
else uvg_cabac_bins_verbose = false;
// else uvg_cabac_bins_verbose = false;
#endif
@ -2193,11 +2215,12 @@ int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
{
const cu_array_t *cua = state->tile->frame->cu_array;
// Quantization group width
const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);
const int qg_width = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_width);
const int qg_height = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_height);
// Coordinates of the top-left corner of the quantization group
const int x_qg = x & ~(qg_width - 1);
const int y_qg = y & ~(qg_width - 1);
const int y_qg = y & ~(qg_height - 1);
if(x_qg == 0 && y_qg > 0 && y_qg % LCU_WIDTH == 0) {
return uvg_cu_array_at_const(cua, x_qg, y_qg - 1)->qp;
}

View file

@ -332,6 +332,7 @@ typedef struct encoder_state_t {
int8_t qp;
double c_lambda;
double chroma_weights[4];
/**
* \brief Whether a QP delta value must be coded for the current LCU.
@ -359,7 +360,15 @@ typedef struct encoder_state_t {
//Constraint structure
void * constraint;
// Since lfnst needs the collocated luma intra mode for
// dual tree if the chroma mode is cclm mode and getting all of
// the information that would be necessary to get the collocated
// luma mode in the lfnst functions, instead store the current
// collocated luma mode in the state.
int8_t collocated_luma_mode;
quant_block quant_blocks[3]; // luma, ISP, chroma
rate_estimator_t rate_estimator[4]; // luma, cb, cr, isp
} encoder_state_t;
void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame);
@ -401,14 +410,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
* \param depth depth in the CU tree
* \return true, if it's the last CU in its QG, otherwise false
*/
static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc)
{
if (state->frame->max_qp_delta_depth < 0) return false;
const int cu_width = LCU_WIDTH >> depth;
const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
const int right = x + cu_width;
const int bottom = y + cu_width;
const int right = cu_loc->x + cu_loc->width;
const int bottom = cu_loc->y + cu_loc->height;
return (right % qg_width == 0 || right >= state->tile->frame->width) &&
(bottom % qg_width == 0 || bottom >= state->tile->frame->height);
}

View file

@ -36,6 +36,7 @@
#include "cu.h"
#include "encoder.h"
#include "intra.h"
#include "uvg266.h"
#include "transform.h"
#include "videoframe.h"
@ -269,19 +270,19 @@ static bool is_tu_boundary(
int32_t x,
int32_t y,
edge_dir dir,
color_t color,
enum uvg_tree_type tree_type)
{
x >>= tree_type == UVG_CHROMA_T;
y >>= tree_type == UVG_CHROMA_T;
// if (x & 3 || y & 3) return false;
const cu_info_t *const scu =
uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y);
const int tu_width = LCU_WIDTH >> (scu->tr_depth + (tree_type == UVG_CHROMA_T));
if (dir == EDGE_HOR) {
return (y & (tu_width - 1)) == 0;
return color == COLOR_Y ? scu->luma_deblocking & EDGE_HOR :
scu->chroma_deblocking & EDGE_HOR;
} else {
return (x & (tu_width - 1)) == 0;
return color == COLOR_Y ? scu->luma_deblocking & EDGE_VER :
scu->chroma_deblocking & EDGE_VER;
}
}
@ -306,32 +307,6 @@ static bool is_pu_boundary(const encoder_state_t *const state,
it for now, in case some other tool requires it.
*/
return false;
//const cu_info_t *const scu =
// uvg_cu_array_at_const(state->tile->frame->cu_array, x, y);
//// Get the containing CU.
//const int32_t cu_width = LCU_WIDTH >> scu->depth;
//const int32_t x_cu = x & ~(cu_width - 1);
//const int32_t y_cu = y & ~(cu_width - 1);
//const cu_info_t *const cu =
// uvg_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu);
//const int num_pu = uvg_part_mode_num_parts[cu->part_size];
//for (int i = 0; i < num_pu; i++) {
// if (dir == EDGE_HOR) {
// int y_pu = PU_GET_Y(cu->part_size, cu_width, y_cu, i);
// if (y_pu == y) {
// return true;
// }
// } else {
// int x_pu = PU_GET_X(cu->part_size, cu_width, x_cu, i);
// if (x_pu == x) {
// return true;
// }
// }
//}
//return false;
}
@ -346,9 +321,9 @@ static bool is_pu_boundary(const encoder_state_t *const state,
static bool is_on_8x8_grid(int x, int y, edge_dir dir)
{
if (dir == EDGE_HOR) {
return (y & 7) == 0 && (x & 2) == 0;
return (y & 7) == 0;
} else {
return (x & 7) == 0 && (y & 2) == 0;
return (x & 7) == 0;
}
}
@ -628,10 +603,10 @@ static INLINE void get_max_filter_length(uint8_t *filt_len_P, uint8_t *filt_len_
bool transform_edge_4x4[2] = { false, false };
bool transform_edge_8x8[2] = { false, false };
if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, tree_type);
if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, tree_type);
if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, tree_type);
if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, tree_type);
if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, comp, tree_type);
if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, comp, tree_type);
if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, comp, tree_type);
if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, comp, tree_type);
if (comp == COLOR_Y) {
if (tu_size_P_side <= 4 || tu_size_Q_side <= 4){
@ -756,8 +731,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
cu_q = uvg_cu_array_at(frame->cu_array, x_coord, y);
}
bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y)
|| cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y);
bool nonzero_coeffs = cbf_is_set(cu_q->cbf, COLOR_Y)
|| cbf_is_set(cu_p->cbf, COLOR_Y);
// Filter strength
strength = 0;
@ -766,7 +741,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
}
else if (tu_boundary && nonzero_coeffs) {
// Non-zero residual/coeffs and transform boundary
// Neither CU is intra so tr_depth <= MAX_DEPTH.
strength = 1;
}
else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == UVG_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice?
@ -854,18 +828,50 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
bool is_side_Q_large = false;
uint8_t max_filter_length_P = 0;
uint8_t max_filter_length_Q = 0;
const int cu_size = LCU_WIDTH >> cu_q->depth;
const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ?
1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0)
+ (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0);
const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx)
: PU_GET_W(cu_q->part_size, cu_size, pu_part_idx);
const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx)
: x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx);
const int cu_width = 1 << cu_q->log2_width;
const int cu_height = 1 << cu_q->log2_height;
const int pu_size = dir == EDGE_HOR ? cu_height : cu_width;
const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord;
int tu_size_q_side = 0;
if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) {
if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) {
tu_size_q_side = MAX(4, cu_width >> 2);
} else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) {
tu_size_q_side = MAX(4, cu_height >> 2);
} else {
tu_size_q_side = dir == EDGE_HOR ?
MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) :
MIN(1 << cu_q->log2_width, TR_MAX_WIDTH);
}
} else {
tu_size_q_side = dir == EDGE_HOR ?
MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) :
MIN(1 << cu_q->log2_width, TR_MAX_WIDTH);
}
int tu_size_p_side = 0;
if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) {
if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) {
tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2);
} else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) {
tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2);
} else {
tu_size_p_side = dir == EDGE_HOR ?
MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) :
MIN(1 << cu_p->log2_width, TR_MAX_WIDTH);
}
} else {
tu_size_p_side = dir == EDGE_HOR ?
MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) :
MIN(1 << cu_p->log2_width, TR_MAX_WIDTH);
}
get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,
dir, tu_boundary,
LCU_WIDTH >> cu_p->tr_depth,
LCU_WIDTH >> cu_q->tr_depth,
tu_size_p_side,
tu_size_q_side,
pu_pos, pu_size, cu_q->merged, COLOR_Y,
UVG_LUMA_T);
@ -1073,41 +1079,44 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
// CUs on both sides of the edge
cu_info_t *cu_p;
cu_info_t *cu_q;
int32_t x_coord = x << (tree_type != UVG_CHROMA_T);
int32_t y_coord = y << (tree_type != UVG_CHROMA_T);
int32_t x_coord = x << 1;
int32_t y_coord = y << 1;
cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array;
if (dir == EDGE_VER) {
y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T);
y_coord = (y + min_chroma_length * blk_idx) << (1);
cu_p = uvg_cu_array_at(cua, x_coord - 1, y_coord);
cu_q = uvg_cu_array_at(cua, x_coord , y_coord);
} else {
x_coord = (x + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T);
x_coord = (x + min_chroma_length * blk_idx) << (1);
cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1);
cu_q = uvg_cu_array_at(cua, x_coord, y_coord );
}
const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T));
const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ?
1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0)
+ ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0);
const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx)
: PU_GET_W(cu_q->part_size, cu_size, pu_part_idx);
const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx)
: x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx);
uint8_t max_filter_length_P = 0;
uint8_t max_filter_length_Q = 0;
const int tu_p_size = LCU_WIDTH >> (cu_p->tr_depth + (chroma_shift));
const int tu_q_size = LCU_WIDTH >> (cu_q->tr_depth + (chroma_shift));
const int cu_width = 1 << (cu_q->log2_chroma_width );
const int cu_height = 1 << (cu_q->log2_chroma_height);
const int pu_size = dir == EDGE_HOR ? cu_height : cu_width;
const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord;
const int tu_size_p_side = dir == EDGE_HOR ?
MIN(1 << (cu_p->log2_chroma_height), TR_MAX_WIDTH) :
MIN(1 << (cu_p->log2_chroma_width), TR_MAX_WIDTH);
const int tu_size_q_side = dir == EDGE_HOR ?
MIN(1 << (cu_q->log2_chroma_height ), TR_MAX_WIDTH) :
MIN(1 << (cu_q->log2_chroma_width ), TR_MAX_WIDTH);
get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,
dir, tu_boundary, tu_p_size, tu_q_size,
dir, tu_boundary, tu_size_p_side, tu_size_q_side,
pu_pos, pu_size, cu_q->merged, COLOR_U,
tree_type);
const bool large_boundary = (max_filter_length_P >= 3 && max_filter_length_Q >= 3);
const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % (LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) == 0);
const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % LCU_WIDTH == 0);
uint8_t c_strength[2] = { 0, 0 };
@ -1116,10 +1125,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
c_strength[1] = 2;
}
else if (tu_boundary){ //TODO: Add ciip/IBC related stuff
bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_U)
|| cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_U);
bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_V)
|| cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_V);
bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, COLOR_U)
|| cbf_is_set(cu_p->cbf, COLOR_U);
bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, COLOR_V)
|| cbf_is_set(cu_p->cbf, COLOR_V);
c_strength[0] = nonzero_coeffs_U ? 1 : 0;
c_strength[1] = nonzero_coeffs_V ? 1 : 0;
}
@ -1238,10 +1247,11 @@ static void filter_deblock_unit(
const int32_t x_c = x >> 1;
const int32_t y_c = y >> 1;
if (state->encoder_control->chroma_format != UVG_CSP_400 &&
(is_on_8x8_grid(x_c, y_c, dir && (x_c + 4) % 32)
|| (x == state->tile->frame->width - 8 && dir == 1 && y_c % 8 == 0))
is_tu_boundary(state, x, y, dir, COLOR_UV, tree_type)
&& (is_on_8x8_grid(x_c, y_c, dir == EDGE_HOR && (x_c + 4) % 32 ? EDGE_HOR : EDGE_VER)
|| (x == state->tile->frame->width - 8 && dir == EDGE_HOR && y_c % 8 == 0))
&& tree_type != UVG_LUMA_T) {
filter_deblock_edge_chroma(state, x_c, y_c, length, dir, tu_boundary, tree_type);
filter_deblock_edge_chroma(state, x_c, y_c, 2, dir, tu_boundary, tree_type);
}
}
@ -1271,11 +1281,11 @@ static void filter_deblock_lcu_inside(encoder_state_t * const state,
for (int edge_y = y; edge_y < end_y; edge_y += 4) {
for (int edge_x = x; edge_x < end_x; edge_x += 4) {
bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, luma_tree);
bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, COLOR_Y, luma_tree);
if (tu_boundary || is_pu_boundary(state, edge_x, edge_y, dir)) {
filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, luma_tree);
}
if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, chroma_tree)) {
if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, COLOR_UV, chroma_tree)) {
filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, chroma_tree);
}
}
@ -1302,7 +1312,7 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state,
for (int x = x_px - 8; x < x_px; x += 4) {
for (int y = y_px; y < end; y += 4) {
// The top edge of the whole frame is not filtered.
bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, luma_tree);
bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, COLOR_Y, luma_tree);
if (y > 0 && (tu_boundary || is_pu_boundary(state, x, y, EDGE_HOR))) {
filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR, tu_boundary);
}
@ -1313,13 +1323,15 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state,
if (state->encoder_control->chroma_format != UVG_CSP_400) {
const int x_px_c = x_px >> 1;
const int y_px_c = y_px >> 1;
const int x_c = x_px_c - 4;
const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
for (int y_c = y_px_c; y_c < end_c; y_c += 8) {
int x_c = x_px_c - 4;
const int end_c_y = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
for(; x_c < x_px_c; x_c += 2) {
for (int y_c = y_px_c; y_c < end_c_y; y_c += 8) {
// The top edge of the whole frame is not filtered.
bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, chroma_tree);
bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, COLOR_UV, chroma_tree);
if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) {
filter_deblock_edge_chroma(state, x_c , y_c, 4, EDGE_HOR, tu_boundary, chroma_tree);
filter_deblock_edge_chroma(state, x_c , y_c, 2, EDGE_HOR, tu_boundary, chroma_tree);
}
}
}
}

View file

@ -46,8 +46,8 @@
* \brief Edge direction.
*/
typedef enum edge_dir {
EDGE_VER = 0, // vertical
EDGE_HOR = 1, // horizontal
EDGE_VER = 1, // vertical
EDGE_HOR = 2, // horizontal
} edge_dir;

View file

@ -145,11 +145,11 @@ typedef int32_t mv_t;
#define INTERNAL_MV_PREC 4 // Internal motion vector precision, 4 = 1/16 pel
//! Limits for prediction block sizes. 0 = 64x64, 4 = 4x4.
//! Limits for prediction block sizes.
#define PU_DEPTH_INTER_MIN 0
#define PU_DEPTH_INTER_MAX 3
#define PU_DEPTH_INTER_MAX 8
#define PU_DEPTH_INTRA_MIN 0
#define PU_DEPTH_INTRA_MAX 4
#define PU_DEPTH_INTRA_MAX 8
//! Maximum number of layers in GOP structure (for allocating structures)
#define MAX_GOP_LAYERS 6
@ -273,7 +273,6 @@ typedef int32_t mv_t;
#define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
#define CLIP_TO_QP(value) CLIP(0, 51, (value))
#define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
#define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
#define CEILDIV(x,y) (((x) + (y) - 1) / (y))

View file

@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride,
* \param predict_luma Enable or disable luma prediction for this call.
* \param predict_chroma Enable or disable chroma prediction for this call.
*/
static unsigned inter_recon_unipred(const encoder_state_t * const state,
static unsigned inter_recon_unipred(
const encoder_state_t * const state,
const uvg_picture * const ref,
int32_t pu_x,
int32_t pu_y,
int32_t pu_w,
int32_t pu_h,
int32_t out_stride_luma,
const mv_t mv_param[2],
yuv_t *yuv_px,
yuv_im_t *yuv_im,
bool predict_luma,
bool predict_chroma)
bool predict_chroma,
const cu_loc_t* const cu_loc)
{
vector2d_t int_mv = { mv_param[0], mv_param[1] };
uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv);
const int pu_x = cu_loc->x;
const int pu_y = cu_loc->y;
const int pu_w = cu_loc->width;
const int pu_h = cu_loc->height;
const vector2d_t int_mv_in_frame = {
int_mv.x + pu_x + state->tile->offset_x,
int_mv.y + pu_y + state->tile->offset_y
@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state,
* \param predict_luma Enable or disable luma prediction for this call.
* \param predict_chroma Enable or disable chroma prediction for this call.
*/
void uvg_inter_recon_bipred(const encoder_state_t *const state,
void uvg_inter_recon_bipred(
const encoder_state_t *const state,
const uvg_picture *ref1,
const uvg_picture *ref2,
int32_t pu_x,
int32_t pu_y,
int32_t pu_w,
int32_t pu_h,
mv_t mv_param[2][2],
lcu_t *lcu,
bool predict_luma,
bool predict_chroma)
bool predict_chroma,
const cu_loc_t* const cu_loc)
{
// Allocate maximum size arrays for interpolated and copied samples
ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,
ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
const int pu_x = cu_loc->x;
const int pu_y = cu_loc->y;
const int pu_w = cu_loc->width;
const int pu_h = cu_loc->height;
yuv_t px_L0;
px_L0.size = pu_w * pu_h;
px_L0.y = &px_buf_L0[0];
@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,
// Sample blocks from both reference picture lists.
// Flags state if the outputs were written to high-precision / interpolated sample buffers.
unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0],
&px_L0, &im_L0, predict_luma, predict_chroma);
unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1],
&px_L1, &im_L1, predict_luma, predict_chroma);
unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma,
cu_loc);
unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma,
cu_loc);
// After reconstruction, merge the predictors by taking an average of each pixel
uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1,
@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state,
* \param predict_luma Enable or disable luma prediction for this call.
* \param predict_chroma Enable or disable chroma prediction for this call.
*/
void uvg_inter_recon_cu(const encoder_state_t * const state,
void uvg_inter_recon_cu(
const encoder_state_t * const state,
lcu_t *lcu,
int32_t x,
int32_t y,
int32_t width,
bool predict_luma,
bool predict_chroma)
bool predict_chroma,
const cu_loc_t* const cu_loc)
{
cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
const int num_pu = uvg_part_mode_num_parts[cu->part_size];
for (int i = 0; i < num_pu; ++i) {
uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i);
}
uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc);
}
static void ibc_recon_cu(const encoder_state_t * const state,
@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state,
int32_t y,
int32_t width,
bool predict_luma,
bool predict_chroma,
int i_pu)
bool predict_chroma)
{
const int x_scu = SUB_SCU(x);
const int y_scu = SUB_SCU(y);
@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state,
* \param predict_chroma Enable or disable chroma prediction for this call.
* \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP.
*/
void uvg_inter_pred_pu(const encoder_state_t * const state,
void uvg_inter_pred_pu(
const encoder_state_t * const state,
lcu_t *lcu,
int32_t x,
int32_t y,
int32_t width,
bool predict_luma,
bool predict_chroma,
int i_pu)
const cu_loc_t* const cu_loc)
{
const int x_scu = SUB_SCU(x);
const int y_scu = SUB_SCU(y);
cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);
const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu);
const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu);
const int pu_w = PU_GET_W(cu->part_size, width, i_pu);
const int pu_h = PU_GET_H(cu->part_size, width, i_pu);
cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
if (cu->type == CU_IBC) {
ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu);
} else {
const int x_scu = SUB_SCU(cu_loc->x);
const int y_scu = SUB_SCU(cu_loc->y);
cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);
if (pu->inter.mv_dir == 3) {
const uvg_picture * const refs[2] = {
state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]],
state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]],
const uvg_picture *const refs[2] = {
state->frame->ref->images[
state->frame->ref_LX[0][
pu->inter.mv_ref[0]]],
state->frame->ref->images[
state->frame->ref_LX[1][
pu->inter.mv_ref[1]]],
};
uvg_inter_recon_bipred(
state,
refs[0],
refs[1],
pu_x,
pu_y,
pu_w,
pu_h,
pu->inter.mv,
lcu,
predict_luma,
predict_chroma);
} else {
uvg_inter_recon_bipred(state,
refs[0], refs[1],
pu->inter.mv, lcu,
predict_luma, predict_chroma,
cu_loc);
}
else if (pu->type == CU_IBC) {
ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma);
} else{
const int mv_idx = pu->inter.mv_dir - 1;
const uvg_picture * const ref =
state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]];
const uvg_picture *const ref =
state->frame->ref->images[
state->frame->ref_LX[mv_idx][
pu->inter.mv_ref[mv_idx]]];
const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x);
const unsigned offset_chroma =
SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2;
const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x);
const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2;
yuv_t lcu_adapter;
lcu_adapter.size = pu_w * pu_h;
lcu_adapter.y = lcu->rec.y + offset_luma,
lcu_adapter.u = lcu->rec.u + offset_chroma,
lcu_adapter.v = lcu->rec.v + offset_chroma,
lcu_adapter.size = cu_loc->width * cu_loc->height;
lcu_adapter.y = lcu->rec.y + offset_luma;
lcu_adapter.u = lcu->rec.u + offset_chroma;
lcu_adapter.v = lcu->rec.v + offset_chroma;
inter_recon_unipred(
state,
inter_recon_unipred(state,
ref,
pu_x,
pu_y,
pu_w,
pu_h,
LCU_WIDTH,
pu->inter.mv[mv_idx],
LCU_WIDTH, pu->inter.mv[mv_idx],
&lcu_adapter,
NULL,
predict_luma,
predict_chroma);
}
predict_chroma,
cu_loc);
}
if (predict_chroma && state->encoder_control->cfg.jccr) {
const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
}
}
@ -915,11 +899,9 @@ static bool is_b0_cand_coded(int x, int y, int width, int height)
* \param ref_idx index in the reference list
* \param cand_out will be filled with C0 and C1 candidates
*/
static void get_temporal_merge_candidates(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
static void get_temporal_merge_candidates(
const encoder_state_t * const state,
const cu_loc_t* const cu_loc,
uint8_t ref_list,
uint8_t ref_idx,
merge_candidates_t *cand_out)
@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state,
cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref];
int cu_per_width = ref_cu_array->width / SCU_WIDTH;
int32_t xColBr = x + width;
int32_t yColBr = y + height;
int32_t xColBr = cu_loc->x + cu_loc->width;
int32_t yColBr = cu_loc->y + cu_loc->height;
// C0 must be available
if (xColBr < state->encoder_control->in.width &&
@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state,
}
}
}
int32_t xColCtr = x + (width / 2);
int32_t yColCtr = y + (height / 2);
int32_t xColCtr = cu_loc->x + (cu_loc->width / 2);
int32_t yColCtr = cu_loc->y + (cu_loc->height / 2);
// C1 must be inside the LCU, in the center position of current CU
if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) {
@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state,
* \param lcu current LCU
* \param cand_out will be filled with A and B candidates
*/
static void get_spatial_merge_candidates(int32_t x,
int32_t y,
int32_t width,
int32_t height,
static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc,
int32_t picture_width,
int32_t picture_height,
lcu_t *lcu,
@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x,
|A1|_________|
|A0|
*/
int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
int32_t y_local = SUB_SCU(y);
const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU
const int32_t y_local = SUB_SCU(cu_loc->y);
const int x = cu_loc->x;
const int y = cu_loc->y;
const int width = cu_loc->width;
const int height = cu_loc->height;
// A0 and A1 availability testing
if (x != 0) {
cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1);
@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x,
* \param picture_height tile height in pixels
* \param cand_out will be filled with A and B candidates
*/
static void get_spatial_merge_candidates_cua(const cu_array_t *cua,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
static void get_spatial_merge_candidates_cua(
const cu_array_t *cua,
int32_t picture_width,
int32_t picture_height,
merge_candidates_t *cand_out,
bool wpp)
bool wpp,
const cu_loc_t* const cu_loc)
{
/*
Predictor block locations
@ -1370,8 +1352,12 @@ static void get_spatial_merge_candidates_cua(const cu_array_t *cua,
|A1|_________|
|A0|
*/
int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
int32_t y_local = SUB_SCU(y);
const int x = cu_loc->x;
const int y = cu_loc->y;
const int width = cu_loc->width;
const int height = cu_loc->height;
const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU
const int32_t y_local = SUB_SCU(y);
// A0 and A1 availability testing
if (x != 0) {
const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1);
@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state,
/**
* \brief Pick two mv candidates from the spatial and temporal candidates.
*/
static void get_mv_cand_from_candidates(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
static void get_mv_cand_from_candidates(
const encoder_state_t * const state,
const merge_candidates_t *merge_cand,
const cu_info_t * const cur_cu,
int8_t reflist,
mv_t mv_cand[2][2])
mv_t mv_cand[2][2],
int ctu_row)
{
const cu_info_t *const *a = merge_cand->a;
const cu_info_t *const *b = merge_cand->b;
@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
if (candidates < AMVP_MAX_NUM_CANDS)
{
const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
int32_t num_cand = state->tile->frame->hmvp_size[ctu_row];
for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) {
@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
* \param lcu current LCU
* \param reflist reflist index (either 0 or 1)
*/
void uvg_inter_get_mv_cand(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
void uvg_inter_get_mv_cand(
const encoder_state_t * const state,
mv_t mv_cand[2][2],
const cu_info_t * const cur_cu,
lcu_t *lcu,
int8_t reflist)
int8_t reflist,
const cu_loc_t* const cu_loc)
{
merge_candidates_t merge_cand = { 0 };
const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level;
if (cur_cu->type == CU_IBC) {
mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand);
get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand);
memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2);
memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2);
} else {
get_spatial_merge_candidates(x, y, width, height,
state->tile->frame->width,
state->tile->frame->height,
lcu,
&merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp);
get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu,
&merge_cand,
parallel_merge_level,
state->encoder_control->cfg.wpp);
get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand);
get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH);
}
uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]);
uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]);
}
@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
* \param cur_cu current CU
* \param reflist reflist index (either 0 or 1)
*/
void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
void uvg_inter_get_mv_cand_cua(
const encoder_state_t * const state,
mv_t mv_cand[2][2],
const cu_info_t* cur_cu,
int8_t reflist)
int8_t reflist,
const cu_loc_t* const cu_loc)
{
merge_candidates_t merge_cand = { 0 };
const cu_array_t *cua = state->tile->frame->cu_array;
if (cur_cu->type == CU_IBC) {
mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand);
get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand);
memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2);
memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2);
} else {
get_spatial_merge_candidates_cua(cua,
x, y, width, height,
state->tile->frame->width, state->tile->frame->height,
&merge_cand, state->encoder_control->cfg.wpp);
get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp,
cu_loc);
get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand);
get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH);
}
uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]);
uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]);
}
@ -1885,10 +1864,9 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) {
* \param lcu lcu containing the block
* \return number of merge candidates
*/
uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
int32_t x, int32_t y,
int32_t width, int32_t height,
bool use_a1, bool use_b1,
uint8_t uvg_inter_get_merge_cand(
const encoder_state_t * const state,
const cu_loc_t* const cu_loc,
inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS],
lcu_t *lcu)
{
@ -1897,11 +1875,12 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level;
merge_candidates_t merge_cand = { 0 };
const uint8_t max_num_cands = state->encoder_control->cfg.max_merge;
// Current CU
cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y));
cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
if(cur_cu->type == CU_IBC) {
mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2];
get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand);
get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand);
for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) {
mv_cand[i].dir = 1;
mv_cand[i].mv[0][0] = ibc_mv_cand[i][0];
@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
}
return IBC_MRG_MAX_NUM_CANDS;
}
get_spatial_merge_candidates(x, y, width, height,
state->tile->frame->width,
state->tile->frame->height,
lcu,
&merge_cand, parallel_merge_level, state->encoder_control->cfg.wpp);
get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu,
&merge_cand,
parallel_merge_level,
state->encoder_control->cfg.wpp);
const cu_info_t **a = merge_cand.a;
const cu_info_t **b = merge_cand.b;
if (!use_a1) a[1] = NULL;
if (!use_b1) b[1] = NULL;
const int x = cu_loc->x;
const int y = cu_loc->y;
if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++;
if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++;
@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
for (int reflist = 0; reflist <= max_reflist; reflist++) {
// Fetch temporal candidates for the current CU
// ToDo: change collocated_from_l0_flag to allow L1 ref
get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand);
// TODO: enable L1 TMVP candidate
// get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand);
@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
if (candidates == max_num_cands) return candidates;
if (candidates != max_num_cands - 1) {
const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH);
const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
int32_t num_cand = state->tile->frame->hmvp_size[ctu_row];

View file

@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv);
void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver);
void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv);
void uvg_inter_recon_cu(const encoder_state_t * const state,
void uvg_inter_recon_cu(
const encoder_state_t * const state,
lcu_t *lcu,
int32_t x,
int32_t y,
int32_t width,
bool predict_luma,
bool predict_chroma);
void uvg_inter_pred_pu(const encoder_state_t * const state,
lcu_t *lcu,
int32_t x,
int32_t y,
int32_t width,
bool predict_luma,
bool predict_chroma,
int i_pu);
const cu_loc_t* const cu_loc);
void uvg_inter_pred_pu(
const encoder_state_t * const state,
lcu_t *lcu,
bool predict_luma,
bool predict_chroma,
const cu_loc_t* const cu_loc);
void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu);
void uvg_inter_recon_bipred(const encoder_state_t * const state,
void uvg_inter_recon_bipred(
const encoder_state_t * const state,
const uvg_picture * ref1,
const uvg_picture * ref2,
int32_t xpos,
int32_t ypos,
int32_t width,
int32_t height,
mv_t mv_param[2][2],
lcu_t* lcu,
bool predict_luma,
bool predict_chroma);
bool predict_chroma,
const cu_loc_t* const cu_loc);
void uvg_inter_get_mv_cand(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
void uvg_inter_get_mv_cand(
const encoder_state_t * const state,
mv_t mv_cand[2][2],
const cu_info_t* cur_cu,
lcu_t *lcu,
int8_t reflist);
int8_t reflist,
const cu_loc_t* const cu_loc);
void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state,
int32_t x,
int32_t y,
int32_t width,
int32_t height,
void uvg_inter_get_mv_cand_cua(
const encoder_state_t * const state,
mv_t mv_cand[2][2],
const cu_info_t* cur_cu,
int8_t reflist);
int8_t reflist,
const cu_loc_t* const cu_loc);
uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state,
int32_t x, int32_t y,
int32_t width, int32_t height,
bool use_a1, bool use_b1,
uint8_t uvg_inter_get_merge_cand(
const encoder_state_t * const state,
const cu_loc_t* const cu_loc,
inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS],
lcu_t *lcu);
#endif

File diff suppressed because it is too large Load diff

View file

@ -71,6 +71,7 @@ typedef struct {
double coeff_bits;
double distortion;
double lfnst_costs[3];
uint8_t best_isp_cbfs;
} intra_search_data_t ;
@ -107,7 +108,9 @@ int8_t uvg_intra_get_dir_luma_predictor(
* \param multi_ref_idx Multi reference line index for the prediction block.
*/
void uvg_intra_build_reference(
const int_fast8_t log2_width,
const encoder_state_t* const state,
const cu_loc_t* const pu_loc,
const cu_loc_t* const cu_loc,
const color_t color,
const vector2d_t *const luma_px,
const vector2d_t *const pic_px,
@ -115,7 +118,8 @@ void uvg_intra_build_reference(
uvg_intra_references *const refs,
bool entropy_sync,
uvg_pixel *extra_refs,
uint8_t multi_ref_idx);
uint8_t multi_ref_idx,
const uint8_t isp_mode);
/**
* \brief Generate intra predictions.
@ -130,32 +134,60 @@ void uvg_intra_predict(
const encoder_state_t* const state,
uvg_intra_references* const refs,
const cu_loc_t* const cu_loc,
const cu_loc_t* const pu_loc,
const color_t color,
uvg_pixel* dst,
const intra_search_data_t* data,
const lcu_t* lcu,
enum uvg_tree_type tree_type
);
const lcu_t* lcu
);
void uvg_intra_recon_cu(
encoder_state_t* const state,
int x,
int y,
int depth,
intra_search_data_t* search_data,
const cu_loc_t* cu_loc,
cu_info_t *cur_cu,
lcu_t *lcu,
enum uvg_tree_type tree_type,
bool recon_luma,
bool recon_chroma);
const cu_info_t* uvg_get_co_located_luma_cu(
int x,
int y,
int width,
int height,
double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state,
const cu_loc_t* const cu_loc,
double cost_treshold,
intra_search_data_t* const search_data,
lcu_t* const lcu, bool* violates_lfnst);
int8_t uvg_get_co_located_luma_mode(
const cu_loc_t* const chroma_loc,
const cu_loc_t* const cu_loc,
const cu_info_t* luma_cu,
const lcu_t* const lcu,
const cu_array_t* const cu_array,
enum uvg_tree_type tree_type);
bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t* const luma_loc, cu_info_t const* const cur_cu, enum
uvg_tree_type tree_type);
int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a);
uint8_t uvg_get_mip_flag_context(
const cu_loc_t* const cu_loc,
const lcu_t* lcu,
cu_array_t* const cu_a);
int8_t uvg_wide_angle_correction(
int_fast8_t mode,
const int log2_width,
const int log2_height,
const bool account_for_dc_planar);
// ISP related defines
#define NUM_ISP_MODES 3
#define ISP_MODE_NO_ISP 0
#define ISP_MODE_HOR 1
#define ISP_MODE_VER 2
#define SPLIT_TYPE_HOR 1
#define SPLIT_TYPE_VER 2
int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block);
int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block);
void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block);
bool uvg_can_use_isp(const int width, const int height);
bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type);

View file

@ -795,12 +795,20 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
state->frame->QP + 2 + frame_allocation,
est_qp);
}
if(state->encoder_control->cfg.dep_quant) {
est_lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = est_lambda;
state->lambda_sqrt = sqrt(est_lambda);
state->qp = est_qp;
int8_t chroma_qp = encoder->qp_map[0][est_qp];
double tmpWeight = pow(2.0, (est_qp - chroma_qp) / 3.0);
if (state->encoder_control->cfg.dep_quant)
{
tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
}
state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight;
state->c_lambda = est_lambda / tmpWeight;
ctu->qp = est_qp;
ctu->lambda = est_lambda;
@ -820,7 +828,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
double to_lambda = qp_to_lambda(state, state->qp);
if (state->encoder_control->cfg.dep_quant) {
to_lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = to_lambda;
state->lambda_sqrt = sqrt(state->lambda);
ctu->adjust_lambda = state->lambda;
@ -1103,7 +1115,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
pos.x = 0;
}
state->qp = CLIP_TO_QP(state->frame->QP + dqp);
state->lambda = qp_to_lambda(state, state->qp);
double to_lambda = qp_to_lambda(state, state->qp);
if (state->encoder_control->cfg.dep_quant) {
to_lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = to_lambda;
state->lambda_sqrt = sqrt(state->lambda);
}
else if (ctrl->cfg.target_bitrate > 0) {
@ -1138,6 +1155,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
state->frame->lambda * 1.5874010519681994,
lambda);
lambda = clip_lambda(lambda);
if (state->encoder_control->cfg.dep_quant) {
lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = lambda;
state->lambda_sqrt = sqrt(lambda);
@ -1145,8 +1165,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
} else {
state->qp = state->frame->QP;
state->lambda = state->frame->lambda;
state->lambda_sqrt = sqrt(state->frame->lambda);
double lambda = state->frame->lambda;
if (state->encoder_control->cfg.dep_quant) {
lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = lambda;
state->lambda_sqrt = sqrt(lambda);
}
lcu->lambda = state->lambda;
@ -1154,6 +1179,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
int8_t chroma_qp = ctrl->qp_map[0][state->qp];
double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0);
if (state->encoder_control->cfg.dep_quant)
{
tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
}
state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight;
state->c_lambda = state->lambda / tmpWeight;
// Apply variance adaptive quantization
@ -1170,10 +1200,34 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
double to_lambda = qp_to_lambda(state, state->qp);
if (state->encoder_control->cfg.dep_quant) {
to_lambda *= pow(2, 0.25 / 3.0);
}
state->lambda = to_lambda;
state->lambda_sqrt = sqrt(state->lambda);
lcu->adjust_lambda = state->lambda;
lcu->adjust_qp = state->qp;
}
}
double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode)
{
const encoder_control_t * const ctrl = state->encoder_control;
double lambda = state->lambda;
int8_t chroma_qp = ctrl->qp_map[0][state->qp];
double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0);
if (state->encoder_control->cfg.dep_quant) {
tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma)
}
lambda /= tmpWeight;
lambda *= use_jccr && state->qp > 18 ? 1.3 : 1.0;
if (jccr_mode == 1 || jccr_mode == 2) {
lambda *= 0.8;
} else if (jccr_mode == 3) {
lambda *= 0.5;
}
return lambda;
}

View file

@ -76,4 +76,6 @@ void uvg_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos);
void uvg_update_after_picture(encoder_state_t * const state);
void uvg_estimate_pic_lambda(encoder_state_t * const state);
double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode);
#endif // RATE_CONTROL_H_

272
src/rdo.c
View file

@ -33,6 +33,7 @@
#include "rdo.h"
#include <errno.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
@ -52,7 +53,6 @@
#include "strategies/strategies-quant.h"
#define QUANT_SHIFT 14
#define SCAN_SET_SIZE 16
#define LOG2_SCAN_SET_SIZE 4
#define SBH_THRESHOLD 4
@ -297,15 +297,20 @@ out:
static INLINE double get_coeff_cabac_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
int32_t width,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip,
cu_info_t* cur_tu)
{
const int width = cu_loc->width;
const int height = cu_loc->height;
const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
// Make sure there are coeffs present
bool found = false;
for (int i = 0; i < width*width; i++) {
for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) {
if (coeff[i] != 0) {
found = 1;
break;
@ -330,7 +335,7 @@ static INLINE double get_coeff_cabac_cost(
uvg_encode_coeff_nxn((encoder_state_t*) state,
&cabac_copy,
coeff,
width,
cu_loc,
color,
scan_mode,
cur_tu,
@ -341,6 +346,7 @@ static INLINE double get_coeff_cabac_cost(
&cabac_copy,
coeff,
width,
height,
color,
scan_mode,
&bits);
@ -391,14 +397,36 @@ double uvg_get_coeff_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip)
int8_t tr_skip,
int coeff_order)
{
uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
int x_local = cu_loc->x % LCU_WIDTH;
int y_local = cu_loc->y % LCU_WIDTH;
const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int lcu_width = color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C;
const coeff_t* coeff_ptr = NULL;
coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
if (coeff_order == COEFF_ORDER_LINEAR) {
coeff_ptr = coeff;
}
else {
// Coeff order CU
uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width);
coeff_ptr = sub_coeff;
}
if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) {
// TODO: do we need to assert(0) out of the fast-estimation branch if we
@ -409,17 +437,17 @@ double uvg_get_coeff_cost(
return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
} else {
uint64_t weights = uvg_fast_coeff_get_weights(state);
uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
uint32_t fast_cost = uvg_fast_coeff_cost(coeff_ptr, width, height, weights);
if (check_accuracy) {
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu);
save_accuracy(state->qp, ccc, fast_cost);
}
return fast_cost;
}
} else {
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu);
if (save_cccs) {
save_ccc(state->qp, coeff, width * width, ccc);
save_ccc(state->qp, coeff, width * height, ccc);
}
return ccc;
}
@ -684,12 +712,13 @@ void uvg_rdoq_sign_hiding(
const int32_t last_pos,
const coeff_t *const coeffs,
coeff_t *const quant_coeffs,
const int8_t color)
const int8_t color,
const bool need_sqrt_adjust)
{
const encoder_control_t * const ctrl = state->encoder_control;
const double lambda = color ? state->c_lambda : state->lambda;
int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6];
int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6];
// This somehow scales quant_delta into fractional bits. Instead of the bits
// being multiplied by lambda, the residual is divided by it, or something
// like that.
@ -814,28 +843,28 @@ void uvg_rdoq_sign_hiding(
}
}
static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height)
static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height, uint8_t mts_index)
{
const coeff_t* pData = coeff + posX + posY * width;
coeff_t sum = 0;
if (posX < width - 1)
{
sum += abs(pData[1]);
sum += mts_index && posX + 1 >= 16 ? 0 : abs(pData[1]);
if (posX < width - 2)
{
sum += abs(pData[2]);
sum += mts_index && posX + 2 >= 16 ? 0 : abs(pData[2]);
}
if (posY < height - 1)
{
sum += abs(pData[width + 1]);
sum += mts_index && (posY + 1 >= 16 || posX + 1 >= 16) ? 0 : abs(pData[width + 1]);
}
}
if (posY < height - 1)
{
sum += abs(pData[width]);
sum += mts_index && posY + 1 >= 16 ? 0 : abs(pData[width]);
if (posY < height - 2)
{
sum += abs(pData[width << 1]);
sum += mts_index && posY + 2 >= 16 ? 0 : abs(pData[width << 1]);
}
}
return MAX(MIN(sum - 5 * baseLevel, 31), 0);
@ -1141,7 +1170,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
const int max_log2_tr_dynamic_range = 15;
uint32_t log2_tr_width = uvg_math_floor_log2(width);
uint32_t log2_tr_height = uvg_math_floor_log2(height);
const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0];
const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1];
@ -1166,15 +1196,18 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
switch (cg_num) {
case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); FILL_ARRAY(cost_coeffgroup_sig, 0, 1); break;
case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); FILL_ARRAY(cost_coeffgroup_sig, 0, 2); break;
case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); FILL_ARRAY(cost_coeffgroup_sig, 0, 4); break;
case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); FILL_ARRAY(cost_coeffgroup_sig, 0, 8); break;
case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); FILL_ARRAY(cost_coeffgroup_sig, 0, 16); break;
case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); FILL_ARRAY(cost_coeffgroup_sig, 0, 32); break;
case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); FILL_ARRAY(cost_coeffgroup_sig, 0, 64); break;
default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
}
const bool needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation.
const int q_bits = QUANT_SHIFT + qp_scaled / 6 + (needs_sqrt2_scale ? -1 : 0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6];
const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6];
const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff;
@ -1182,8 +1215,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1;
const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1];
const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);
uint32_t coeff_levels[3];
double coeff_level_error[4];
@ -1221,8 +1254,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
scan_pos = (sbId << log2_cg_size) + scan_pos_in_sb;
int last_pos_coded = sbSizeM1;
uint32_t blkpos = scan[scan_pos];
uint32_t pos_y = blkpos >> log2_block_size;
uint32_t pos_x = blkpos - (pos_y << log2_block_size);
uint32_t pos_y = blkpos >> log2_block_width;
uint32_t pos_x = blkpos - (pos_y << log2_block_width);
//===== quantization =====
// set coeff
@ -1365,6 +1398,48 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
return abs_sum;
}
static uint32_t context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y,
uint32_t width, uint32_t height, int8_t color,
int32_t* temp_diag, int32_t* temp_sum, int8_t mts)
{
const coeff_t* data = coeff + pos_x + pos_y * width;
const int diag = pos_x + pos_y;
int num_pos = 0;
int sum_abs = 0;
#define UPDATE(x) {int a=abs(x);sum_abs+=MIN(4+(a&1),a);num_pos+=(a?1:0);}
if (pos_x < width - 1)
{
UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[1]);
if (pos_x < width - 2)
{
UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[2]);
}
if (pos_y < height - 1)
{
UPDATE(mts && (pos_y + 1 >= 16 || pos_x + 1 >= 16) ? 0 : data[width + 1]);
}
}
if (pos_y < height - 1)
{
UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[width]);
if (pos_y < height - 2)
{
UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[width << 1]);
}
}
#undef UPDATE
int ctx_ofs = MIN((sum_abs + 1) >> 1, 3) + (diag < 2 ? 4 : 0);
if (color == COLOR_Y)
{
ctx_ofs += diag < 5 ? 4 : 0;
}
*temp_diag = diag;
*temp_sum = sum_abs - num_pos;
return ctx_ofs;
}
/** RDOQ with CABAC
* \returns void
* Rate distortion optimized quantization for entropy
@ -1377,31 +1452,35 @@ void uvg_rdoq(
coeff_t *dest_coeff,
int32_t width,
int32_t height,
int8_t type,
int8_t color,
int8_t scan_mode,
int8_t block_type,
int8_t tr_depth,
uint16_t cbf,
uint8_t lfnst_idx)
uint8_t lfnst_idx, uint8_t mts_idx)
{
const encoder_control_t * const encoder = state->encoder_control;
cabac_data_t * const cabac = &state->cabac;
uint32_t log2_tr_width = uvg_math_floor_log2( height );
uint32_t log2_tr_height = uvg_math_floor_log2( width );
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform
const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1);
needs_block_size_trafo_scale |= 0; // Non log2 block size
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform
uint16_t go_rice_param = 0;
uint32_t reg_bins = (width * height * 28) >> 4;
const uint32_t log2_block_size = uvg_g_convert_to_bit[ width ] + 2;
int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + type;
int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + color;
int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
const double lambda = type ? state->c_lambda : state->lambda;
int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift - needs_block_size_trafo_scale;
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
const double *err_scale = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
const double lambda = color ? state->c_lambda : state->lambda;
const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF;
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6];
const double *err_scale = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6];
double block_uncoded_cost = 0;
@ -1415,14 +1494,19 @@ void uvg_rdoq(
memset(dest_coeff, 0, sizeof(coeff_t) * width * height);
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1];
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0];
const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2));
const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width);
const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height);
const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);
const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
const uint32_t cg_size = 16;
const int32_t shift = 4 >> 1;
const uint32_t num_blk_side = width >> shift;
const uint32_t num_blk_side = MAX(width >> shift, 1);
double cost_coeffgroup_sig[ 64 ];
uint32_t sig_coeffgroup_flag[ 64 ];
@ -1431,26 +1515,34 @@ void uvg_rdoq(
int32_t temp_diag = -1;
int32_t temp_sum = -1;
const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ];
int32_t cg_last_scanpos = -1;
int32_t last_scanpos = -1;
uint32_t cg_num = width * height >> 4;
uint32_t cg_num = lfnst_idx > 0 ? 1 : width * height >> 4;
double dTransShift = (double)transform_shift + (needs_block_size_trafo_scale ? -0.5 : 0.0);
// Compensate for scaling of bitcount in Lagrange cost function
double scale = CTX_FRAC_ONE_BIT;
// Compensate for scaling through forward transform
scale = scale * pow(2.0, -2.0 * dTransShift);
const double default_error_scale = scale / default_quant_coeff / default_quant_coeff;
// Explicitly tell the only possible numbers of elements to be zeroed.
// Hope the compiler is able to utilize this information.
switch (cg_num) {
case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); break;
case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); break;
case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); break;
case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); break;
case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break;
case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); break;
case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups");
}
cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[type ? 2 : 0]);
cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]);
cabac_ctx_t* base_gt1_ctx = (type == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]);
cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]);
cabac_ctx_t *baseCtx = (color == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]);
cabac_ctx_t* base_gt1_ctx = (color == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]);
struct {
double coded_level_and_dist;
@ -1463,21 +1555,26 @@ void uvg_rdoq(
//Find last cg and last scanpos
const int max_lfnst_pos = ((height == 4 && width == 4) || (height == 8 && width == 8)) ? 7 : 15;
int32_t cg_scanpos;
uint32_t max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1;
for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--)
{
for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--)
uint32_t cg_blkpos = scan_cg[cg_scanpos];
uint32_t cg_pos_y = cg_blkpos / num_blk_side;
uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side);
if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue;
for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--)
{
int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break;
uint32_t blkpos = scan[scanpos];
int32_t q = quant_coeff[blkpos];
int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff;
int32_t level_double = coef[blkpos];
level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1)));
uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits;
double err = (double)level_double;
cost_coeff0[scanpos] = err * err * err_scale[blkpos];
cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale);
dest_coeff[blkpos] = max_abs_level;
if (max_abs_level > 0) {
@ -1507,43 +1604,45 @@ void uvg_rdoq(
uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side);
FILL(rd_stats, 0);
for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue;
for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) {
int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
if (scanpos > last_scanpos) {
continue;
}
uint32_t blkpos = scan[scanpos];
int32_t q = quant_coeff[blkpos];
double temp = err_scale[blkpos];
int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff;
double temp = (use_scaling_list ? err_scale[blkpos] : default_error_scale);
int32_t level_double = coef[blkpos];
level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1)));
uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits;
dest_coeff[blkpos] = max_abs_level;
double err = (double)level_double;
cost_coeff0[scanpos] = err * err * err_scale[blkpos];
cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale);
block_uncoded_cost += cost_coeff0[ scanpos ];
if (last_scanpos >= 0) {
uint32_t pos_y = blkpos >> log2_block_size;
uint32_t pos_x = blkpos - (pos_y << log2_block_size);
uint32_t pos_y = blkpos >> log2_block_width;
uint32_t pos_x = blkpos - (pos_y << log2_block_width);
//===== coefficient level estimation =====
int32_t level;
uint16_t ctx_sig = 0;
if (scanpos != last_scanpos) {
ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, type, &temp_diag, &temp_sum);
// VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here
ctx_sig = context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum, mts_idx);
}
if (temp_diag != -1) {
ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((type == 0) ? 15 : 5) : (type == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0);
ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((color == 0) ? 15 : 5) : (color == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0);
}
else ctx_set = 0;
if (reg_bins < 4) {
int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height);
int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height,mts_idx);
go_rice_param = g_auiGoRiceParsCoeff[sumAll];
}
@ -1554,12 +1653,12 @@ void uvg_rdoq(
if (scanpos == last_scanpos) {
level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos],
level_double, max_abs_level, 0, gt1_ctx, gt2_ctx, par_ctx, go_rice_param,
reg_bins, q_bits, temp, 1, type);
reg_bins, q_bits, temp, 1, color);
}
else {
level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos],
level_double, max_abs_level, ctx_sig, gt1_ctx, gt2_ctx, par_ctx, go_rice_param,
reg_bins, q_bits, temp, 0, type);
reg_bins, q_bits, temp, 0, color);
if (encoder->cfg.signhide_enable) {
int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1);
int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0);
@ -1572,14 +1671,14 @@ void uvg_rdoq(
if (encoder->cfg.signhide_enable) {
sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8);
if (level > 0) {
int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false);
sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false);
sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
}
else { // level == 0
if (reg_bins < 4) {
int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false);
sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now;
int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false);
sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now;
}
else {
sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_gt1_ctx[gt1_ctx], 0);
@ -1595,7 +1694,7 @@ void uvg_rdoq(
}
else if (reg_bins >= 4) {
reg_bins -= (level < 2 ? level : 3) + (scanpos != last_scanpos);
int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height);
int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height, mts_idx);
go_rice_param = g_auiGoRiceParsCoeff[sumAll];
}
}
@ -1620,7 +1719,7 @@ void uvg_rdoq(
if( cg_scanpos ) {
if (sig_coeffgroup_flag[cg_blkpos] == 0) {
uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
cg_pos_y, cg_width);
cg_pos_y, cg_width, cg_height);
cost_coeffgroup_sig[cg_scanpos] = lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost;
} else {
@ -1636,7 +1735,7 @@ void uvg_rdoq(
// add SigCoeffGroupFlag cost to total cost
ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
cg_pos_y, cg_width);
cg_pos_y, cg_width, cg_height);
cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
base_cost += cost_coeffgroup_sig[cg_scanpos];
@ -1656,7 +1755,7 @@ void uvg_rdoq(
cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
// reset coeffs to 0 in this block
for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) {
int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
uint32_t blkpos = scan[scanpos];
if (dest_coeff[blkpos]){
@ -1679,12 +1778,12 @@ void uvg_rdoq(
int8_t found_last = 0;
int32_t best_last_idx_p1 = 0;
if( block_type != CU_INTRA && !type ) {
if( block_type != CU_INTRA && !color ) {
best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
base_cost += lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
} else {
cabac_ctx_t* base_cbf_model = NULL;
switch (type) {
switch (color) {
case COLOR_Y:
base_cbf_model = cabac->ctx.qt_cbf_model_luma;
break;
@ -1697,25 +1796,26 @@ void uvg_rdoq(
default:
assert(0);
}
ctx_cbf = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U));
// This cbf should work even with non-square blocks
ctx_cbf = ( color != COLOR_V ? 0 : cbf_is_set(cbf, COLOR_U));
best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
}
calc_last_bits(state, width, height, type, last_x_bits, last_y_bits);
calc_last_bits(state, width, height, color, last_x_bits, last_y_bits);
for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
uint32_t cg_blkpos = scan_cg[cg_scanpos];
base_cost -= cost_coeffgroup_sig[cg_scanpos];
if (sig_coeffgroup_flag[ cg_blkpos ]) {
for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
for ( int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) {
int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
if (scanpos > last_scanpos) continue;
uint32_t blkpos = scan[scanpos];
if( dest_coeff[ blkpos ] ) {
uint32_t pos_y = blkpos >> log2_block_size;
uint32_t pos_x = blkpos - ( pos_y << log2_block_size );
uint32_t pos_y = blkpos >> log2_block_width;
uint32_t pos_x = blkpos - ( pos_y << log2_block_width );
double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits );
double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
@ -1739,19 +1839,31 @@ void uvg_rdoq(
} // end for
uint32_t abs_sum = 0;
if(!mts_idx || (width < 32 && height < 32)) {
for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
int32_t blkPos = scan[scanpos];
int32_t level = dest_coeff[blkPos];
abs_sum += level;
dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level);
}
}
else {
for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
int32_t blkPos = scan[scanpos];
int32_t blk_x = blkPos & (width - 1);
int32_t blk_y = blkPos >> log2_block_width;
int32_t level = blk_x >= 16 || blk_y >= 16 ? 0 : dest_coeff[blkPos];
abs_sum += level;
dest_coeff[blkPos] = (coeff_t)(( level < 0 ) ? -level : level);
}
}
//===== clean uncoded coefficients =====
for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) {
dest_coeff[scan[scanpos]] = 0;
}
if (encoder->cfg.signhide_enable && abs_sum >= 2) {
uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, type);
uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale);
}
}

View file

@ -44,6 +44,8 @@
#include "global.h" // IWYU pragma: keep
#include "search_inter.h"
#define QUANT_SHIFT 14
#define IQUANT_SHIFT 6
extern const uint32_t uvg_g_go_rice_range[5];
extern const uint32_t uvg_g_go_rice_prefix_len[5];
@ -60,9 +62,8 @@ void uvg_rdoq(
int8_t type,
int8_t scan_mode,
int8_t block_type,
int8_t tr_depth,
uint16_t cbf,
uint8_t lfnst_idx);
uint8_t lfnst_idx, uint8_t mts_idx);
int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_coeff, int32_t width,
@ -73,10 +74,11 @@ double uvg_get_coeff_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
const cu_loc_t* const cu_loc,
color_t color,
int8_t scan_mode,
int8_t tr_skip);
int8_t tr_skip,
int coeff_order);
int32_t uvg_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_gt1, uint16_t ctx_num_gt2, uint16_t ctx_num_par,
uint16_t abs_go_rice, uint32_t reg_bins, int8_t type, int use_limited_prefix_length);

View file

@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] =
24, 25, 28, 33, 41, 54, 71, 91
};
const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564};
const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72};
const int16_t uvg_g_quant_scales[2][6] = {
{26214, 23302, 20560, 18396, 16384, 14564},
{ 18396,16384,14564,13107,11651,10280 }
};
const int16_t uvg_g_inv_quant_scales[2][6] = {
{40, 45, 51, 57, 64, 72},
{ 57,64,72,80,90,102 }
};
/**
@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons
int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp];
int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp];
// Encoder list
uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio,
// Encoder list TODO: the sqrt adjusted lists
uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio,
MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);
// Decoder list
scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio,
scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio,
MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);

File diff suppressed because it is too large Load diff

View file

@ -84,19 +84,17 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);
void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);
double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
double uvg_cu_rd_cost_luma(
const encoder_state_t *const state,
const cu_loc_t* const cu_loc,
const cu_info_t *const pred_cu,
lcu_t *const lcu);
double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
lcu_t *const lcu,
uint8_t isp_cbf);
double uvg_cu_rd_cost_chroma(
const encoder_state_t *const state,
cu_info_t *const pred_cu,
lcu_t *const lcu);
lcu_t *const lcu,
const cu_loc_t * const);
void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
tree_type);
void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
#endif

View file

@ -75,7 +75,8 @@ typedef struct {
* \brief Possible optimized SAD implementation for the width, leave as
* NULL for arbitrary-width blocks
*/
optimized_sad_func_ptr_t optimized_sad;
optimized_sad_func_ptr_t optimized_sad_y;
optimized_sad_func_ptr_t optimized_sad_uv;
lcu_t *lcu;
@ -109,8 +110,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x,
}
static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y)
static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y)
{
const uint32_t x = loc->x;
const uint32_t y = loc->y;
const int x_scu = SUB_SCU(x);
const int y_scu = SUB_SCU(y);
@ -132,9 +135,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu
cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);;
cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);;
uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400);
uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc);
*cur_cu = cu_backup;
uint32_t width = loc->width;
uint32_t height = loc->height;
cost = uvg_satd_any_size(width,
width,
@ -162,10 +167,15 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu
}
static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y)
static uint32_t calculate_ibc_cost_sad(ibc_search_info_t *info, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y)
{
const uint32_t x = loc->x;
const uint32_t y = loc->y;
lcu_t *lcu = info->lcu;
cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
const encoder_state_t* state = info->state;
cu_info_t cu_backup = *cur_cu;
uint32_t cost = MAX_INT;
@ -173,6 +183,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s
const int y_scu = SUB_SCU(y);
const uint32_t offset = x_scu + y_scu * LCU_WIDTH;
const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
const uint32_t width = loc->width;
const uint32_t height = loc->height;
cur_cu->type = CU_IBC;
cur_cu->inter.mv_dir = 1;
@ -183,23 +195,26 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s
cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);;
cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);;
uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400);
uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc);
*cur_cu = cu_backup;
if (optimized_sad != NULL) {
cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride);
if(state->encoder_control->chroma_format != UVG_CSP_400) {
cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
}
if (info->optimized_sad_y != NULL) {
cost = info->optimized_sad_y(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride);
} else {
cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride);
if(state->encoder_control->chroma_format != UVG_CSP_400) {
}
// ToDo: Enable chroma cost calculation
/* if (state->encoder_control->chroma_format != UVG_CSP_400) {
if (info->optimized_sad_uv != NULL) {
cost += info->optimized_sad_uv(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
cost += info->optimized_sad_uv(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
} else {
cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2);
}
}
}*/
return cost;
}
@ -235,8 +250,11 @@ static bool check_mv_cost(ibc_search_info_t *info,
double bitcost = 0;
double cost = MAX_DOUBLE;
cu_loc_t loc;
uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height);
cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y);
cost = calculate_ibc_cost_sad(info, &loc, x, y);
if (cost >= *best_cost) return false;
@ -246,7 +264,7 @@ static bool check_mv_cost(ibc_search_info_t *info,
info->mv_cand,
NULL,
0,
NULL,
0,
&bitcost
);
@ -782,39 +800,23 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
* \param amvp Return searched AMVP PUs sorted by costs
* \param merge Return searched Merge PUs sorted by costs
*/
static void search_pu_ibc(encoder_state_t * const state,
int x_cu, int y_cu,
int depth,
part_mode_t part_mode,
int i_pu,
static void search_pu_ibc(
encoder_state_t * const state,
const cu_loc_t * const cu_loc,
unit_stats_map_t *amvp,
unit_stats_map_t *merge,
ibc_search_info_t *info)
{
const uvg_config *cfg = &state->encoder_control->cfg;
const videoframe_t * const frame = state->tile->frame;
const int width_cu = LCU_WIDTH >> depth;
const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
const int width = PU_GET_W(part_mode, width_cu, i_pu);
const int height = PU_GET_H(part_mode, width_cu, i_pu);
// Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
// nRx2N partitions.
const bool merge_a1 = i_pu == 0 || width >= height;
// Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and
// 2NxnD partitions.
const bool merge_b1 = i_pu == 0 || width <= height;
const int width_cu = cu_loc->width;
const int height_cu= cu_loc->height;
lcu_t *lcu = info->lcu;
const int x_local = SUB_SCU(x);
const int y_local = SUB_SCU(y);
const int x_local = SUB_SCU(cu_loc->x);
const int y_local = SUB_SCU(cu_loc->y);
cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
cur_pu->type = CU_IBC;
cur_pu->part_size = part_mode;
cur_pu->depth = depth;
cur_pu->tr_depth = depth;
cur_pu->qp = state->qp;
cur_pu->inter.mv_dir = 1;
@ -825,20 +827,20 @@ static void search_pu_ibc(encoder_state_t * const state,
info->state = state;
info->pic = frame->source;
info->origin.x = x;
info->origin.y = y;
info->width = width;
info->height = height;
info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
info->optimized_sad = uvg_get_optimized_sad(width);
info->origin.x = cu_loc->x;
info->origin.y = cu_loc->y;
info->width = width_cu;
info->height = height_cu;
info->mvd_cost_func =
cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
info->optimized_sad_y = uvg_get_optimized_sad(width_cu);
info->optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width);
info->lcu = lcu;
// Search for merge mode candidates
info->num_merge_cand = uvg_inter_get_merge_cand(
state,
x, y,
width, height,
merge_a1, merge_b1,
cu_loc,
info->merge_cand,
lcu);
@ -853,7 +855,7 @@ static void search_pu_ibc(encoder_state_t * const state,
#ifdef COMPLETE_PRED_MODE_BITS
// Technically counting these bits would be correct, however counting
// them universally degrades quality so this block is disabled by default
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0);
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0);
#else
const double no_skip_flag = 0;
#endif
@ -875,7 +877,7 @@ static void search_pu_ibc(encoder_state_t * const state,
{
continue;
}
uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu);
uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc);
merge->unit[merge->size] = *cur_pu;
merge->unit[merge->size].type = CU_IBC;
merge->unit[merge->size].merge_idx = merge_idx;
@ -883,11 +885,11 @@ static void search_pu_ibc(encoder_state_t * const state,
merge->unit[merge->size].skipped = false;
double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
if(state->encoder_control->cfg.rdo >= 2) {
uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc);
}
else {
merge->cost[merge->size] = uvg_satd_any_size(width, height,
merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu,
lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
bits += no_skip_flag;
@ -909,7 +911,7 @@ static void search_pu_ibc(encoder_state_t * const state,
// Early Skip Mode Decision
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
if (cfg->early_skip) {
for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
merge->size = 1;
@ -919,6 +921,7 @@ static void search_pu_ibc(encoder_state_t * const state,
merge->keys[0] = 0;
}
else if(cfg->rdo < 2) {
const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
// Reconstruct blocks with merge candidate.
// Check luma CBF. Then, check chroma CBFs if luma CBF is not set
// and chroma exists.
@ -927,19 +930,18 @@ static void search_pu_ibc(encoder_state_t * const state,
cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir;
cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0];
cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1];
uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T);
uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
uvg_inter_recon_cu(state, lcu, true, false, cu_loc);
uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T);
if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
if (cbf_is_set(cur_pu->cbf, COLOR_Y)) {
continue;
}
else if (has_chroma) {
uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc);
uvg_quantize_lcu_residual(state, false, has_chroma,
false, /*we are only checking for lack of coeffs so no need to check jccr*/
x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
if (!cbf_is_set_any(cur_pu->cbf, depth)) {
cu_loc, cur_pu, lcu, true, UVG_BOTH_T);
if (!cbf_is_set_any(cur_pu->cbf)) {
cur_pu->type = CU_IBC;
cur_pu->merge_idx = merge_idx;
cur_pu->skipped = true;
@ -965,14 +967,11 @@ static void search_pu_ibc(encoder_state_t * const state,
// Do the motion search
uvg_inter_get_mv_cand(info->state,
info->origin.x,
info->origin.y,
info->width,
info->height,
info->mv_cand,
cur_pu,
lcu,
NULL);
0,
cu_loc);
vector2d_t best_mv = { 0, 0 };
@ -1003,9 +1002,7 @@ static void search_pu_ibc(encoder_state_t * const state,
best_cost = calculate_ibc_cost_satd(
info->state,
lcu,
info->origin.x,
info->origin.y,
info->width,
cu_loc,
(best_mv.x >> INTERNAL_MV_PREC),
(best_mv.y >> INTERNAL_MV_PREC));
best_cost += best_bits * info->state->lambda;
@ -1052,16 +1049,16 @@ static void search_pu_ibc(encoder_state_t * const state,
};
if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);
if (state->encoder_control->cfg.rdo >= 2) {
if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc);
}
if(cfg->rdo < 2) {
int predmode_ctx;
const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3;
const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx);
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0);
const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
@ -1077,33 +1074,29 @@ static void search_pu_ibc(encoder_state_t * const state,
#include "threads.h"
static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
int x, int y, int depth,
const cu_loc_t* cu_loc,
lcu_t* lcu,
double* inter_cost,
double* inter_bitcost)
{
const int x_cu = x;
const int y_cu = y;
const int x_cu = cu_loc->x;
const int y_cu = cu_loc->y;
const int part_mode = SIZE_2Nx2N;
const uvg_config *cfg = &state->encoder_control->cfg;
const videoframe_t * const frame = state->tile->frame;
const int width_cu = LCU_WIDTH >> depth;
const int width = PU_GET_W(part_mode, width_cu, 0);
const int height = PU_GET_H(part_mode, width_cu, 0);
const int width_cu = cu_loc->width;
const int height_cu = cu_loc->height;
const bool merge_a1 = true;
const bool merge_b1 = true;
ibc_search_info_t info;
const int x_local = SUB_SCU(x);
const int y_local = SUB_SCU(y);
const int x_local = SUB_SCU(x_cu);
const int y_local = SUB_SCU(y_cu);
cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
cur_pu->type = CU_IBC;
cur_pu->part_size = part_mode;
cur_pu->depth = depth;
cur_pu->tr_depth = depth;
cur_pu->qp = state->qp;
// Default to candidate 0
@ -1113,24 +1106,20 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
info.state = state;
info.pic = frame->source;
info.origin.x = x;
info.origin.y = y;
info.width = width;
info.height = height;
info.origin.x = cu_loc->x;
info.origin.y = cu_loc->y;
info.width = width_cu;
info.height = height_cu;
info.mvd_cost_func =
cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost;
info.optimized_sad = uvg_get_optimized_sad(width);
info.optimized_sad_y = uvg_get_optimized_sad(width_cu);
info.optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width);
info.lcu = lcu;
// Search for merge mode candidates
info.num_merge_cand = uvg_inter_get_merge_cand(
state,
x,
y,
width,
height,
merge_a1,
merge_b1,
cu_loc,
info.merge_cand,
lcu);
@ -1145,17 +1134,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
static int evaluations = 0;
static int hits = 0;
UVG_CLOCK_T hashmap_start_temp;
UVG_CLOCK_T hashmap_end_temp;
UVG_CLOCK_T hashmap_start_real_time;
UVG_CLOCK_T hashmap_end_real_time;
UVG_GET_TIME(&hashmap_start_real_time);
int xx = x;
int yy = y;
int xx = x_cu;
int yy = y_cu;
int best_mv_x = INT_MAX>>2;
int best_mv_y = INT_MAX>>2;
@ -1185,12 +1169,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
int pos_y = result->value & 0xffff;
int mv_x = pos_x - xx;
int mv_y = pos_y - yy;
if (pos_x <= xx - width && pos_y <= yy - height) {
if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) {
valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y);
if (valid_mv) {
bool full_block = true; // Is the full block covered by the IBC?
for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) {
for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) {
for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) {
for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) {
uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[
((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE];
@ -1211,7 +1195,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
if (full_block) {
double cost = ibc_cost, bits = ibc_bitcost;
vector2d_t mv = { best_mv_x, best_mv_y};
cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits);
cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, 0, &bits);
//double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt;
//cost +=
bool better_mv = cost < ibc_cost;
@ -1220,7 +1204,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
best_mv_y = mv_y;
ibc_cost = cost;
ibc_bitcost = bits;
fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y);
fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y);
found_block = true;
//break;
}
@ -1238,7 +1222,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
//if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64)
//fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found);
if (!found_block) return;
if (!found_block) return 0;
*inter_cost = ibc_cost;
*inter_bitcost = ibc_bitcost;
@ -1267,18 +1251,16 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
cur_pu->skipped = merged;
const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1);
ibc_cost += ibc_flag * state->lambda;
ibc_bitcost += ibc_flag;
uvg_inter_recon_cu(
state,
lcu,
x,
y,
CU_WIDTH_FROM_DEPTH(depth),
true,
state->encoder_control->chroma_format != UVG_CSP_400);
state->encoder_control->chroma_format != UVG_CSP_400,
cu_loc);
if (*inter_cost < MAX_DOUBLE) {
assert(fracmv_within_ibc_range(
@ -1286,7 +1268,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
cur_pu->inter.mv[0][0],
cur_pu->inter.mv[0][1]));
}
return 1;
}
@ -1305,17 +1287,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state,
* \param inter_bitcost Return inter bitcost
*/
void uvg_search_cu_ibc(encoder_state_t * const state,
int x, int y, int depth,
const cu_loc_t * const cu_loc,
lcu_t *lcu,
double *inter_cost,
double* inter_bitcost)
{
*inter_cost = MAX_DOUBLE;
*inter_bitcost = MAX_INT;
// Quick hashmap search
/* uvg_search_hash_cu_ibc(
state,
x, y, depth,
cu_loc,
lcu,
inter_cost,
inter_bitcost);
@ -1330,8 +1313,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state,
info.lcu = lcu;
search_pu_ibc(state,
x, y, depth,
SIZE_2Nx2N, 0,
cu_loc,
amvp,
&merge,
&info);
@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state,
return;
}
const int x_local = SUB_SCU(x);
const int y_local = SUB_SCU(y);
const int x_local = SUB_SCU(cu_loc->x);
const int y_local = SUB_SCU(cu_loc->y);
cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
*cur_pu = *best_inter_pu;
cur_pu->type = CU_IBC;
uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
true, state->encoder_control->chroma_format != UVG_CSP_400);
uvg_inter_recon_cu(state, lcu,
true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc);
if (*inter_cost < MAX_DOUBLE) {
assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));

View file

@ -46,7 +46,7 @@
void uvg_search_cu_ibc(encoder_state_t * const state,
int x, int y, int depth,
const cu_loc_t * const cu_loc,
lcu_t *lcu,
double *inter_cost,
double* inter_bitcost);

View file

@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc,
/**
* \brief Perform inter search for a single reference frame.
*/
static void search_pu_inter_ref(inter_search_info_t *info,
int depth,
static void search_pu_inter_ref(
inter_search_info_t *info,
lcu_t *lcu,
cu_info_t *cur_cu,
unit_stats_map_t *amvp)
@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info,
// Get MV candidates
cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list];
cu_loc_t cu_loc;
uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height);
uvg_inter_get_mv_cand(info->state,
info->origin.x,
info->origin.y,
info->width,
info->height,
info->mv_cand,
cur_cu,
lcu,
ref_list);
ref_list,
&cu_loc);
vector2d_t best_mv = { 0, 0 };
@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info,
/**
* \brief Search bipred modes for a PU.
*/
static void search_pu_inter_bipred(inter_search_info_t *info,
int depth,
static void search_pu_inter_bipred(
inter_search_info_t *info,
lcu_t *lcu,
unit_stats_map_t *amvp_bipred)
{
cu_loc_t cu_loc;
uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height);
const image_list_t *const ref = info->state->frame->ref;
uint8_t (*ref_LX)[16] = info->state->frame->ref_LX;
const videoframe_t * const frame = info->state->tile->frame;
@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
bipred_pu->skipped = false;
for (int reflist = 0; reflist < 2; reflist++) {
uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc);
}
// Don't try merge candidates that don't satisfy mv constraints.
@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
uvg_inter_recon_bipred(info->state,
ref->images[ref_LX[0][merge_cand[i].ref[0]]],
ref->images[ref_LX[1][merge_cand[j].ref[1]]],
x, y,
width,
height,
mv,
lcu,
true,
false);
false,
&cu_loc);
const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride];
@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
* \param amvp Return searched AMVP PUs sorted by costs
* \param merge Return searched Merge PUs sorted by costs
*/
static void search_pu_inter(encoder_state_t * const state,
int x_cu, int y_cu,
int depth,
part_mode_t part_mode,
int i_pu,
static void search_pu_inter(
encoder_state_t * const state,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
unit_stats_map_t *amvp,
unit_stats_map_t *merge,
@ -1678,25 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state,
{
const uvg_config *cfg = &state->encoder_control->cfg;
const videoframe_t * const frame = state->tile->frame;
const int width_cu = LCU_WIDTH >> depth;
const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
const int width = PU_GET_W(part_mode, width_cu, i_pu);
const int height = PU_GET_H(part_mode, width_cu, i_pu);
const int width_cu = cu_loc->width;
const int height_cu = cu_loc->height;
// Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
// nRx2N partitions.
const bool merge_a1 = i_pu == 0 || width >= height;
// Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and
// 2NxnD partitions.
const bool merge_b1 = i_pu == 0 || width <= height;
const int x_local = SUB_SCU(x);
const int y_local = SUB_SCU(y);
const int x_local = SUB_SCU(cu_loc->x);
const int y_local = SUB_SCU(cu_loc->y);
cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
cur_pu->type = CU_NOTSET;
cur_pu->part_size = part_mode;
cur_pu->depth = depth;
cur_pu->qp = state->qp;
// Default to candidate 0
@ -1707,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state,
info->state = state;
info->pic = frame->source;
info->origin.x = x;
info->origin.y = y;
info->width = width;
info->height = height;
info->origin.x = cu_loc->x;
info->origin.y = cu_loc->y;
info->width = width_cu;
info->height = height_cu;
info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost;
info->optimized_sad = uvg_get_optimized_sad(width);
info->optimized_sad = uvg_get_optimized_sad(width_cu);
// Search for merge mode candidates
info->num_merge_cand = uvg_inter_get_merge_cand(
state,
x, y,
width, height,
merge_a1, merge_b1,
cu_loc,
info->merge_cand,
lcu
);
@ -1754,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state,
// If bipred is not enabled, do not try candidates with mv_dir == 3.
// Bipred is also forbidden for 4x8 and 8x4 blocks by the standard.
if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue;
if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue;
bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge);
@ -1768,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state,
{
continue;
}
uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu);
uvg_inter_pred_pu(state, lcu, true, false, cu_loc);
merge->unit[merge->size] = *cur_pu;
merge->unit[merge->size].type = CU_INTER;
merge->unit[merge->size].merge_idx = merge_idx;
@ -1776,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state,
merge->unit[merge->size].skipped = false;
double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
if(state->encoder_control->cfg.rdo >= 2) {
uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc);
}
else {
merge->cost[merge->size] = uvg_satd_any_size(width, height,
merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height,
lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
bits += no_skip_flag;
@ -1802,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state,
// Early Skip Mode Decision
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
if (cfg->early_skip) {
for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
merge->size = 1;
@ -1812,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state,
merge->keys[0] = 0;
}
else if(cfg->rdo < 2) {
const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
// Reconstruct blocks with merge candidate.
// Check luma CBF. Then, check chroma CBFs if luma CBF is not set
// and chroma exists.
@ -1824,22 +1811,22 @@ static void search_pu_inter(encoder_state_t * const state,
cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1];
cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0];
cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1];
uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T);
uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T);
uvg_inter_recon_cu(state, lcu, true, false, cu_loc);
if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T);
if (cbf_is_set(cur_pu->cbf, COLOR_Y)) {
continue;
}
else if (has_chroma) {
uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc);
uvg_quantize_lcu_residual(state,
false, has_chroma,
false, /*we are only checking for lack of coeffs so no need to check jccr*/
x, y, depth, cur_pu, lcu,
cu_loc, cur_pu, lcu,
true,
UVG_BOTH_T);
if (!cbf_is_set_any(cur_pu->cbf, depth)) {
if (!cbf_is_set_any(cur_pu->cbf)) {
cur_pu->type = CU_INTER;
cur_pu->merge_idx = merge_idx;
cur_pu->skipped = true;
@ -1871,7 +1858,7 @@ static void search_pu_inter(encoder_state_t * const state,
info->ref_idx = ref_idx;
info->ref = state->frame->ref->images[ref_idx];
search_pu_inter_ref(info, depth, lcu, cur_pu, amvp);
search_pu_inter_ref(info, lcu, cur_pu, amvp);
}
assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE);
@ -1936,14 +1923,11 @@ static void search_pu_inter(encoder_state_t * const state,
info->ref = ref->images[info->ref_idx];
uvg_inter_get_mv_cand(info->state,
info->origin.x,
info->origin.y,
info->width,
info->height,
info->mv_cand,
unipred_pu,
lcu,
list);
list,
cu_loc);
double frac_cost = MAX_DOUBLE;
double frac_bits = MAX_INT;
@ -1964,8 +1948,8 @@ static void search_pu_inter(encoder_state_t * const state,
unipred_pu->inter.mv[list][1] = frac_mv.y;
CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand);
if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits);
if (state->encoder_control->cfg.rdo >= 2) {
uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc);
}
amvp[list].cost[key] = frac_cost;
@ -1987,15 +1971,15 @@ static void search_pu_inter(encoder_state_t * const state,
amvp[list].size = n_best;
}
if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) {
if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);
if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]);
if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) {
if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc);
if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc);
}
// Search bi-pred positions
bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B
&& cfg->bipred
&& width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred
&& cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are restricted to unipred
if (can_use_bipred) {
@ -2026,25 +2010,23 @@ static void search_pu_inter(encoder_state_t * const state,
bipred_pu->skipped = false;
for (int reflist = 0; reflist < 2; reflist++) {
uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc);
}
uvg_inter_recon_bipred(info->state,
ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]],
ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]],
x, y,
width,
height,
mv,
lcu,
mv, lcu,
true,
false);
false,
cu_loc
);
const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)];
const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)];
best_bipred_cost =
uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH);
uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH);
double bitcost[2] = { 0, 0 };
@ -2091,17 +2073,17 @@ static void search_pu_inter(encoder_state_t * const state,
}
// TODO: this probably should have a separate command line option
if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]);
if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]);
assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE);
uvg_sort_keys_by_cost(&amvp[2]);
if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]);
if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) {
uvg_cu_cost_inter_rd2(state, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc);
}
}
if(cfg->rdo < 2) {
int predmode_ctx;
const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx);
const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0);
const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
@ -2135,22 +2117,19 @@ static void search_pu_inter(encoder_state_t * const state,
* \param inter_cost Return inter cost
* \param inter_bitcost Return inter bitcost
*/
void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
int x, int y, int depth,
void uvg_cu_cost_inter_rd2(
encoder_state_t * const state,
cu_info_t* cur_cu,
lcu_t *lcu,
double *inter_cost,
double* inter_bitcost){
double* inter_bitcost,
const cu_loc_t* const cu_loc){
int tr_depth = MAX(1, depth);
if (cur_cu->part_size != SIZE_2Nx2N) {
tr_depth = depth + 1;
}
uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T);
const int x_px = SUB_SCU(cu_loc->x);
const int y_px = SUB_SCU(cu_loc->y);
const int width = cu_loc->width;
const int height = cu_loc->height;
const int x_px = SUB_SCU(x);
const int y_px = SUB_SCU(y);
const int width = LCU_WIDTH >> depth;
cabac_data_t cabac_copy;
memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
cabac_data_t* cabac = &state->search_cabac;
@ -2160,31 +2139,43 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
*cur_pu = *cur_cu;
const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma);
uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc);
int index = y_px * LCU_WIDTH + x_px;
double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH,
width) * UVG_LUMA_MULT;
width, height) * UVG_LUMA_MULT;
if (reconstruct_chroma) {
int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width / 2);
cu_loc->chroma_width, cu_loc->chroma_height);
double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width / 2);
cu_loc->chroma_width, cu_loc->chroma_height);
ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
}
double no_cbf_bits;
double bits = 0;
const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL);
if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL);
int8_t depth = 0;
int8_t mtt_depth = 0;
uint32_t splits = cur_cu->split_tree;
while (splits & 7) {
if ((splits & 7) != QT_SPLIT) {
mtt_depth++;
}
depth++;
splits >>= 3;
}
const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth, 0};
if (cur_cu->merged) {
no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;
bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T);
bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
}
else {
no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T);
no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1);
}
double no_cbf_cost = ssd + no_cbf_bits * state->lambda;
@ -2194,20 +2185,20 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
state->encoder_control->cfg.chroma_trskip_enable;
double chroma_cost = 0;
if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && cur_cu->depth == cur_cu->tr_depth && reconstruct_chroma) {
if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && PU_IS_TU(cur_cu) && reconstruct_chroma) {
uvg_quantize_lcu_residual(state,
true,
false,
false, x, y,
depth,
false,
cu_loc,
cur_cu,
lcu,
false,
UVG_BOTH_T);
ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C];
uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width);
uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, width, LCU_WIDTH_C, width);
uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, height, LCU_WIDTH_C, width);
uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, height, LCU_WIDTH_C, width);
ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C];
@ -2216,6 +2207,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
u_pred,
u_resi,
width,
height,
LCU_WIDTH_C,
width);
uvg_generate_residual(
@ -2223,19 +2215,17 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
v_pred,
v_resi,
width,
height,
LCU_WIDTH_C,
width);
uvg_chorma_ts_out_t chorma_ts_out;
uvg_chroma_transform_search(
state,
depth,
lcu,
&cabac_copy,
width,
width,
cu_loc,
index,
0,
cur_cu,
u_pred,
v_pred,
@ -2243,41 +2233,41 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
v_resi,
&chorma_ts_out,
UVG_BOTH_T);
cbf_clear(&cur_cu->cbf, depth, COLOR_U);
cbf_clear(&cur_cu->cbf, depth, COLOR_V);
cbf_clear(&cur_cu->cbf, COLOR_U);
cbf_clear(&cur_cu->cbf, COLOR_V);
if (chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) {
cur_cu->joint_cb_cr = 0;
cur_cu->tr_skip |= (chorma_ts_out.best_u_index == CHROMA_TS) << COLOR_U;
cur_cu->tr_skip |= (chorma_ts_out.best_v_index == CHROMA_TS) << COLOR_V;
if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_U);
if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_V);
if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_U);
if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_V);
chroma_cost += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost;
}
else {
cur_cu->joint_cb_cr = chorma_ts_out.best_combined_index;
if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, depth, COLOR_U);
if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, depth, COLOR_V);
if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, COLOR_U);
if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, COLOR_V);
chroma_cost += chorma_ts_out.best_combined_cost;
}
}
else {
uvg_quantize_lcu_residual(state,
true, reconstruct_chroma,
reconstruct_chroma && state->encoder_control->cfg.jccr, x, y,
depth,
reconstruct_chroma && state->encoder_control->cfg.jccr,
cu_loc,
cur_cu,
lcu,
false,
UVG_BOTH_T);
}
int cbf = cbf_is_set_any(cur_cu->cbf, depth);
int cbf = cbf_is_set_any(cur_cu->cbf);
if(cbf) {
*inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu);
*inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0);
if (reconstruct_chroma) {
if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) {
*inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
if (!PU_IS_TU(cur_cu) || !state->encoder_control->cfg.jccr) {
*inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc);
}
else {
*inter_cost += chroma_cost;
@ -2297,7 +2287,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
if(no_cbf_cost < *inter_cost) {
cur_cu->cbf = 0;
if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
if (cur_cu->merged) {
cur_cu->skipped = 1;
}
*inter_cost = no_cbf_cost;
@ -2321,8 +2311,9 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
* \param inter_cost Return inter cost
* \param inter_bitcost Return inter bitcost
*/
void uvg_search_cu_inter(encoder_state_t * const state,
int x, int y, int depth,
void uvg_search_cu_inter(
encoder_state_t * const state,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
double *inter_cost,
double* inter_bitcost)
@ -2338,12 +2329,8 @@ void uvg_search_cu_inter(encoder_state_t * const state,
inter_search_info_t info;
search_pu_inter(state,
x, y, depth,
SIZE_2Nx2N, 0,
lcu,
amvp,
&merge,
&info);
cu_loc, lcu, amvp,
&merge, &info);
// Early Skip CU decision
if (merge.size == 1 && merge.unit[0].skipped) {
@ -2385,13 +2372,14 @@ void uvg_search_cu_inter(encoder_state_t * const state,
return;
}
const int x_local = SUB_SCU(x);
const int y_local = SUB_SCU(y);
const int x_local = SUB_SCU(cu_loc->x);
const int y_local = SUB_SCU(cu_loc->y);
cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
*cur_pu = *best_inter_pu;
uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
true, state->encoder_control->chroma_format != UVG_CSP_400);
uvg_inter_recon_cu(state, lcu,
true, state->encoder_control->chroma_format != UVG_CSP_400,
cu_loc);
if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) {
assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));

View file

@ -73,8 +73,9 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state,
int32_t ref_idx,
double *bitcost);
void uvg_search_cu_inter(encoder_state_t * const state,
int x, int y, int depth,
void uvg_search_cu_inter(
encoder_state_t * const state,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
double *inter_cost,
double* inter_bitcost);
@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state,
const lcu_t *lcu,
int x,
int y);
void uvg_cu_cost_inter_rd2(encoder_state_t* const state,
int x, int y, int depth,
void uvg_cu_cost_inter_rd2(
encoder_state_t* const state,
cu_info_t* cur_cu,
lcu_t* lcu,
double* inter_cost,
double* inter_bitcost);
double* inter_bitcost,
const cu_loc_t* const cu_loc);
int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);

File diff suppressed because it is too large Load diff

View file

@ -43,27 +43,27 @@
#include "global.h" // IWYU pragma: keep
#include "intra.h"
double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu);
double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t*
const cu_loc,
const lcu_t* lcu);
double uvg_chroma_mode_bits(const encoder_state_t *state,
int8_t chroma_mode, int8_t luma_mode);
int8_t uvg_search_cu_intra_chroma(
encoder_state_t * const state,
const int x_px,
const int y_px,
const int depth,
const cu_loc_t* const cu_loc,
lcu_t *lcu,
intra_search_data_t* best_cclm,
enum uvg_tree_type tree_type);
int8_t luma_mode,
enum uvg_tree_type tree_type,
bool is_separate);
void uvg_search_cu_intra(
encoder_state_t * const state,
const int x_px,
const int y_px,
const int depth,
intra_search_data_t* search_data,
lcu_t *lcu,
enum uvg_tree_type tree_type);
enum uvg_tree_type tree_type,
const cu_loc_t* const cu_loc);
#endif // SEARCH_INTRA_H_

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
#ifndef STRATEGIES_DEPQUANT_AVX2_H_
#define STRATEGIES_DEPQUANT_AVX2_H_
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Optimizations for AVX2.
*/
#include "global.h" // IWYU pragma: keep
int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth);
#endif //STRATEGIES_DEPQUANT_AVX2_H_

View file

@ -38,13 +38,14 @@
* Functions for writing the coding quadtree and related syntax.
*/
#include "cu.h"
#include "encoderstate.h"
#include "global.h"
void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
uint8_t width,
const cu_loc_t *loc,
uint8_t type,
int8_t scan_mode,
int8_t tr_skip,

View file

@ -42,10 +42,9 @@
#include "strategyselector.h"
#include "strategies/missing-intel-intrinsics.h"
/**
* \brief Generate angular predictions.
* \param log2_width Log2 of width, range 2..5.
* \param cu_loc CU locationand size data.
* \param intra_mode Angular mode in range 2..34.
* \param channel_type Color channel.
* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
@ -54,20 +53,28 @@
* \param multi_ref_idx Reference line index for use with MRL.
*/
static void uvg_angular_pred_avx2(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
const int_fast8_t intra_mode,
const int_fast8_t channel_type,
const uvg_pixel *const in_ref_above,
const uvg_pixel *const in_ref_left,
uvg_pixel *const dst,
const uint8_t multi_ref_idx)
const uint8_t multi_ref_idx,
const uint8_t isp_mode,
const int cu_dim)
{
// ISP_TODO: non-square block implementation, height is passed but not used
const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
assert(log2_width >= 2 && log2_width <= 5);
assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
assert(intra_mode >= 2 && intra_mode <= 66);
// TODO: implement handling of MRL
uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0;
uint8_t isp = isp_mode;
__m256i p_shuf_01 = _mm256_setr_epi8(
0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04,
@ -142,7 +149,6 @@ static void uvg_angular_pred_avx2(
//uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE:IDX] = { 0 };
uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
const int_fast32_t width = 1 << log2_width;
int32_t pred_mode = intra_mode; // ToDo: handle WAIP
@ -345,13 +351,13 @@ static void uvg_angular_pred_avx2(
// PDPC
bool PDPC_filter = (width >= 4 || channel_type != 0);
bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0);
if (pred_mode > 1 && pred_mode < 67) {
if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
PDPC_filter = false;
}
else if (mode_disp > 0) {
PDPC_filter = (scale >= 0);
PDPC_filter &= (scale >= 0);
}
}
if(PDPC_filter) {
@ -497,20 +503,27 @@ static void uvg_angular_pred_avx2(
/**
* \brief Generate planar prediction.
* \param log2_width Log2 of width, range 2..5.
* \param cu_loc CU location and size data.
* \param color Color channel.
* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1.
* \param dst Buffer of size width*width.
*/
static void uvg_intra_pred_planar_avx2(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
color_t color,
const uint8_t *const ref_top,
const uint8_t *const ref_left,
uint8_t *const dst)
{
assert(log2_width >= 2 && log2_width <= 5);
// ISP_TODO: non-square block implementation, height is passed but not used
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
const int_fast8_t width = 1 << log2_width;
const uint8_t top_right = ref_top[width + 1];
const uint8_t bottom_left = ref_left[width + 1];
@ -964,12 +977,17 @@ static void uvg_intra_pred_filtered_dc_avx2(
*/
static void uvg_pdpc_planar_dc_avx2(
const int mode,
const int width,
const int log2_width,
const cu_loc_t* const cu_loc,
const color_t color,
const uvg_intra_ref *const used_ref,
uvg_pixel *const dst)
{
// ISP_TODO: non-square block implementation, height is passed but not used
assert(mode == 0 || mode == 1); // planar or DC
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
__m256i shuf_mask_byte = _mm256_setr_epi8(
0, -1, 0, -1, 0, -1, 0, -1,

View file

@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)
static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
const int ref_stride, const int rec_stride,
const int width)
const int width, const int height)
{
assert(width == height && "Non square not yet implemented");
__m256i ssd_part;
__m256i diff = _mm256_setzero_si256();
__m128i sum;
@ -1743,40 +1744,32 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
return diff;
}
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) {
// ISP_TODO: non-square block implementation, height is passed but not used
__m128i diff = _mm_setzero_si128();
switch (width) {
case 4:
diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[0]), diff);
diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[4]), diff);
diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[8]), diff);
diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[12]), diff);
for (int y = 0; y < height; y+=4) {
diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[y * 4]), diff);
diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff);
diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, pred_in + (y + 2) * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff);
diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff);
}
break;
case 8:
diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[0]), diff);
diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[8]), diff);
diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[16]), diff);
diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[24]), diff);
diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[32]), diff);
diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[40]), diff);
diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[48]), diff);
diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[56]), diff);
for (int y = 0; y < height; y += 2) {
diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[y * 8]), diff);
diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff);
}
break;
default:
for (int y = 0; y < width; ++y) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; x += 16) {
diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]);
_mm_storeu_si128((__m128i*) & residual[x + y * width], diff);

View file

@ -380,20 +380,24 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx)
{
const encoder_control_t * const encoder = state->encoder_control;
const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1];
const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height);
int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
uint32_t log2_tr_width = uvg_math_floor_log2(height);
uint32_t log2_tr_height = uvg_math_floor_log2(width);
bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
needs_block_size_trafo_scale |= 0; // Non log2 block size
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
const int32_t q_bits8 = q_bits - 8;
const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
uint32_t ac_sum = 0;
int32_t last_cg = -1;
@ -402,7 +406,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
// Loading once is enough if scaling lists are not off
__m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256();
if (!(state->encoder_control->scaling_list.enable)) {
low_b = _mm256_set1_epi32(quant_coeff[0]);
low_b = _mm256_set1_epi32(default_quant_coeff);
high_b = low_b;
}
@ -579,34 +583,61 @@ static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint
return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec));
}
static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){
static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width, int height){
if (height == width || width >= 16) {
switch (width) {
case 4:
*(int32_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride);
*(int32_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride);
*(int32_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride);
*(int32_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride);
*(int32_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride);
*(int32_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride);
*(int32_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride);
*(int32_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride);
break;
case 8:
*(int64_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride);
*(int64_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride);
*(int64_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride);
*(int64_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride);
*(int64_t*)&(rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride);
*(int64_t*)&(rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride);
*(int64_t*)&(rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride);
*(int64_t*)&(rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride);
*(int64_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride);
*(int64_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride);
*(int64_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride);
*(int64_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride);
*(int64_t*)& (rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride);
*(int64_t*)& (rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride);
*(int64_t*)& (rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride);
*(int64_t*)& (rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride);
break;
default:
for (int y = 0; y < width; ++y) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; x += 16) {
*(int64_t*)&(rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride);
*(int64_t*)&(rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride);
*(int64_t*)& (rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride);
*(int64_t*)& (rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride);
}
}
break;
}
}
else {
switch (width) {
case 4:
for (int y = 0; y < height; y += 4) {
*(int32_t*)& (rec_out[(y + 0) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 0) * width, pred_in + (y + 0) * in_stride);
*(int32_t*)& (rec_out[(y + 1) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 1) * width, pred_in + (y + 1) * in_stride);
*(int32_t*)& (rec_out[(y + 2) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 2) * width, pred_in + (y + 2) * in_stride);
*(int32_t*)& (rec_out[(y + 3) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 3) * width, pred_in + (y + 3) * in_stride);
}
break;
case 8:
for (int y = 0; y < height; ++y) {
*(int32_t*)& (rec_out[y * out_stride]) = get_quantized_recon_8x1_avx2(residual + y * width, pred_in + y * in_stride);
}
break;
default:
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val);
}
}
break;
}
}
}
/**
@ -626,7 +657,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in,
* \returns Whether coeff_out contains any non-zero coefficients.
*/
int uvg_quantize_residual_avx2(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uint8_t *const ref_in, const uint8_t *const pred_in,
@ -637,15 +668,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
// Temporary arrays to pass data to and from uvg_quant and transform functions.
ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
// ISP_TODO: non-square block implementation, height is passed but not used
const int height = width; // TODO: height for non-square blocks
int has_coeffs = 0;
assert(width <= TR_MAX_WIDTH);
assert(width >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
@ -662,40 +693,51 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
// Transform residual. (residual -> coeff)
if (use_trskip) {
uvg_transformskip(state->encoder_control, residual, coeff, width);
uvg_transformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
// Forward low frequency non-separable transform
uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
}
// Quantize coeffs. (coeff -> coeff_out)
if (state->encoder_control->cfg.rdoq_enable &&
int abs_sum = 0;
if(!use_trskip && state->encoder_control->cfg.dep_quant) {
uvg_dep_quant(
state,
cur_cu,
width,
height,
coeff,
coeff_out,
color,
tree_type,
&abs_sum,
state->encoder_control->cfg.scaling_list);
}
else if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip)
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
uvg_rdoq(state, coeff, coeff_out, width, width, color,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index);
uvg_rdoq(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0);
}
else if (state->encoder_control->cfg.rdoq_enable && use_trskip) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, color,
uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
scan_order);
}
else {
uvg_quant(state, coeff, coeff_out, width, width, color,
uvg_quant(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index);
}
// Check if there are any non-zero coefficients.
for (int i = 0; i < width * width; i += 8) {
for (int i = 0; i < width * height; i += 8) {
__m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i]));
has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff);
if(has_coeffs) break;
@ -705,25 +747,25 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
// rec_out.
if (has_coeffs && !early_skip) {
// Get quantized residual. (coeff_out -> coeff -> residual)
uvg_dequant(state, coeff_out, coeff, width, width, color,
uvg_dequant(state, coeff_out, coeff, width, height, color,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
// Inverse low frequency non-separable transform
uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
}
if (use_trskip) {
uvg_itransformskip(state->encoder_control, residual, coeff, width);
uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
int sign, absval;
int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
sign = residual[x + y * width] >= 0 ? 1 : -1;
@ -739,14 +781,14 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
}
// Get quantized reconstruction. (residual + pred_in -> rec_out)
get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width);
get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width, height);
}
else if (rec_out != pred_in) {
// With no coeffs and rec_out == pred_int we skip copying the coefficients
// because the reconstruction is just the prediction.
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
}
@ -763,20 +805,26 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
if (encoder->cfg.dep_quant && !transform_skip) {
uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
return;
}
int32_t shift,add,coeff_q;
int32_t n;
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1);
bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
needs_block_size_trafo_scale |= 0; // Non log2 block size// Represents scaling through forward transform
int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
if (encoder->scaling_list.enable)
{
uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6];
@ -797,7 +845,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
}
}
} else {
int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
add = 1 << (shift-1);
__m256i v_scale = _mm256_set1_epi32(scale);
@ -845,8 +893,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
return parts[0] + parts[1] + parts[2] + parts[3];
}
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
{
assert((width == height) && "Non-square block handling not implemented for this function.");
const __m256i zero = _mm256_setzero_si256();
const __m256i threes = _mm256_set1_epi16(3);
const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
@ -863,7 +912,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64
__m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128);
__m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128);
for (int i = 0; i < width * width; i += 32) {
for (int i = 0; i < width * height; i += 32) {
__m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
__m256i curr_abs_lo = _mm256_abs_epi16 (curr_lo);
__m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes);

View file

@ -771,6 +771,12 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input,
// DCT-2
// 2-point DCT-2 basis matrix: row 0 is (a, a), row 1 is (a, -a).
// Instantiated below as uvg_g_DCT2P2 with a = 64.
#define DEFINE_DCT2_P2_MATRIX(a) \
{ \
a, a, \
a, -a \
}
#define DEFINE_DCT2_P4_MATRIX(a,b,c) \
{ \
a, a, a, a, \
@ -1002,6 +1008,7 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input,
}
// DCT-2
const int16_t uvg_g_DCT2P2[4] = DEFINE_DCT2_P2_MATRIX(64);
const int16_t uvg_g_DCT2P4[16] = DEFINE_DCT2_P4_MATRIX(64, 83, 36);
const int16_t uvg_g_DCT2P8[64] = DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18);
const int16_t uvg_g_DCT2P16[256] = DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9);
@ -1020,6 +1027,68 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77
const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4);
// ********************************** DCT-2 **********************************
// Forward 2-point DCT-2 over `line` columns of 2 samples each.
// Writes one coefficient row of `reduced_line` outputs per basis row
// (row stride is `line`), then zero-fills the `skip_line` tail of both rows.
// `skip_line2` is unused here; it is kept for the shared partial_tr_func
// signature.
static void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
{
  const int32_t rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
  const int16_t* const basis = uvg_g_DCT2P2;
  const int num_cols = line - skip_line;

  for (int col = 0; col < num_cols; ++col) {
    // Butterfly: sum feeds the DC row, difference feeds the second row.
    const int32_t sum  = src[2 * col] + src[2 * col + 1];
    const int32_t diff = src[2 * col] - src[2 * col + 1];
    dst[col]        = (int16_t)((basis[0] * sum  + rounding) >> shift);
    dst[col + line] = (int16_t)((basis[2] * diff + rounding) >> shift);
  }

  if (skip_line) {
    // Zero the skipped tail of both coefficient rows.
    memset(dst + num_cols,        0, sizeof(int16_t) * skip_line);
    memset(dst + line + num_cols, 0, sizeof(int16_t) * skip_line);
  }
}
// Inverse 2-point DCT-2 over `line` columns.
// Reads one coefficient per basis row (row stride `line`), reconstructs
// two clipped output samples per column, then zero-fills the skipped tail.
// `skip_line2` is unused here; it is kept for the shared partial_tr_func
// signature.
static void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2)
{
  const int32_t rounding = 1 << (shift - 1);
  const int16_t* const basis = uvg_g_DCT2P2;
  const int num_cols = line - skip_line;

  for (int col = 0; col < num_cols; ++col) {
    // Combine even and odd contributions to recover the spatial samples.
    const int32_t even = basis[0] * (src[col] + src[col + line]);
    const int32_t odd  = basis[2] * (src[col] - src[col + line]);
    dst[2 * col]     = (short)CLIP(-32768, 32767, (even + rounding) >> shift);
    dst[2 * col + 1] = (short)CLIP(-32768, 32767, (odd  + rounding) >> shift);
  }

  if (skip_line) {
    memset(dst + 2 * num_cols, 0, (skip_line << 1) * sizeof(int16_t));
  }
}
static void fastForwardDCT2_B4(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
{
int32_t j;
@ -1366,11 +1435,6 @@ static void fastForwardDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift,
dst += line;
}
}
if (skip_line2) {
const int reduced_line = line - skip_line2;
dst = p_coef + reduced_line * 32;
memset(dst, 0, skip_line2 * 32 * sizeof(coeff_t));
}
}
static void fastInverseDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2)
@ -2417,16 +2481,16 @@ DCT_MTS_NXN_GENERIC(DST1, 32);
typedef void partial_tr_func(const int16_t*, int16_t*, int32_t, int, int, int);
// ToDo: Enable MTS 2x2 and 64x64 transforms
static partial_tr_func* dct_table[3][5] = {
{ fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL },
{ fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL },
{ fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL },
static partial_tr_func* dct_table[3][6] = {
{ fastForwardDCT2_B2, fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL },
{ NULL, fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL },
{ NULL, fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL },
};
static partial_tr_func* idct_table[3][5] = {
{ fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ },
{ fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL },
{ fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL },
static partial_tr_func* idct_table[3][6] = {
{ fastInverseDCT2_B2, fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ },
{ NULL, fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL },
{ NULL, fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL },
};
@ -2436,11 +2500,12 @@ static const tr_type_t mts_subset_intra[4][2] = { { DST7, DST7 }, { DCT8, DST7 }
void uvg_get_tr_type(
int8_t width,
int8_t height,
color_t color,
const cu_info_t* tu,
tr_type_t* hor_out,
tr_type_t* ver_out,
const int8_t mts_idx)
const int8_t mts_type)
{
*hor_out = DCT2;
*ver_out = DCT2;
@ -2450,13 +2515,19 @@ void uvg_get_tr_type(
return;
}
const int height = width;
const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER));
const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER);
const bool explicit_mts = mts_type == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : (mts_type == UVG_MTS_INTER && tu->type == CU_INTER));
const bool implicit_mts = tu->type == CU_INTRA && (mts_type == UVG_MTS_IMPLICIT || mts_type == UVG_MTS_INTER);
assert(!(explicit_mts && implicit_mts));
const bool is_isp = tu->type == CU_INTRA && tu->intra.isp_mode && color == COLOR_Y ? tu->intra.isp_mode : 0;
const int8_t lfnst_idx = color == COLOR_Y ? tu->lfnst_idx : tu->cr_lfnst_idx;
// const bool is_sbt = cu->type == CU_INTER && tu->sbt && color == COLOR_Y; // TODO: check SBT here when implemented
if (implicit_mts)
if (is_isp && lfnst_idx) {
return;
}
if (implicit_mts || (is_isp && explicit_mts))
{
bool width_ok = width >= 4 && width <= 16;
bool height_ok = height >= 4 && height <= 16;
@ -2472,6 +2543,10 @@ void uvg_get_tr_type(
return;
}
/*
TODO: SBT HANDLING
*/
if (explicit_mts)
{
if (tu->tr_idx > MTS_SKIP) {
@ -2487,27 +2562,31 @@ static void mts_dct_generic(
const color_t color,
const cu_info_t* tu,
const int8_t width,
const int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx)
const int8_t mts_type)
{
tr_type_t type_hor;
tr_type_t type_ver;
uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx)
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height)
{
dct_func *dct_func = uvg_get_dct_func(width, color, tu->type);
dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type);
dct_func(bitdepth, input, output);
}
else
{
const int height = width;
int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0);
int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0);
const int log2_width_minus2 = uvg_g_convert_to_bit[width];
if(tu->lfnst_idx || tu->cr_lfnst_idx) {
const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
//const int log2_width_minus2 = uvg_g_convert_to_bit[width];
//const int log2_height_minus2 = uvg_g_convert_to_bit[height];
if((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) {
if ((width == 4 && height > 4) || (width > 4 && height == 4))
{
skip_width = width - 4;
@ -2520,16 +2599,21 @@ static void mts_dct_generic(
}
}
partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2];
partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2];
partial_tr_func* dct_hor = width != 1 ? dct_table[type_hor][log2_width_minus1] : NULL;
partial_tr_func* dct_ver = height != 1 ? dct_table[type_ver][log2_height_minus1] : NULL;
int16_t tmp[32 * 32];
const int32_t shift_1st = log2_width_minus2 + bitdepth - 7;
const int32_t shift_2nd = log2_width_minus2 + 8;
const int32_t shift_1st = log2_width_minus1 + bitdepth - 8;
const int32_t shift_2nd = log2_height_minus1 + 7;
if (height == 1) {
dct_hor(input, output, shift_1st, height, 0, skip_width);
} else if (width == 1) {
dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height);
} else {
dct_hor(input, tmp, shift_1st, height, 0, skip_width);
dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height);
}
}
}
@ -2538,37 +2622,58 @@ static void mts_idct_generic(
const color_t color,
const cu_info_t* tu,
const int8_t width,
const int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx)
const int8_t mts_type)
{
tr_type_t type_hor;
tr_type_t type_ver;
uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
if (type_hor == DCT2 && type_ver == DCT2)
if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height)
{
dct_func *idct_func = uvg_get_idct_func(width, color, tu->type);
dct_func *idct_func = uvg_get_idct_func(width, height, color, tu->type);
idct_func(bitdepth, input, output);
}
else
{
const int height = width;
const int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0;
const int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0;
const int log2_width_minus2 = uvg_g_convert_to_bit[width];
int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0;
int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0;
const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2];
partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2];
if ((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) {
if ((width == 4 && height > 4) || (width > 4 && height == 4)) {
skip_width = width - 4;
skip_height = height - 4;
}
else if ((width >= 8 && height >= 8)) {
skip_width = width - 8;
skip_height = height - 8;
}
}
partial_tr_func* idct_hor = width != 1 ? idct_table[type_hor][log2_width_minus1] : NULL;
partial_tr_func* idct_ver = height != 1 ? idct_table[type_ver][log2_height_minus1] : NULL;
int16_t tmp[32 * 32];
const int32_t shift_1st = 7;
const int32_t shift_2nd = 20 - bitdepth;
const int max_log2_tr_dynamic_range = 15;
const int transform_matrix_shift = 6;
const int32_t shift_1st = transform_matrix_shift + 1;
const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth;
if (height == 1) {
idct_hor(input, output, shift_2nd + 1, height, 0, skip_width);
} else if (width == 1) {
idct_ver(input, output, shift_2nd + 1, width, 0, skip_height);
} else {
idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
}
}
}
@ -2582,6 +2687,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
//success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);
success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);

View file

@ -0,0 +1,252 @@
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
#include "strategies/generic/depquant-generic.h"
#include "dep_quant.h"
#include "cu.h"
#include "encoderstate.h"
#include "intra.h"
#include "rdo.h"
#include "strategyselector.h"
#include "transform.h"
#include "uvg_math.h"
#include "generic/quant-generic.h"
// Precomputed fractional-bit costs for Golomb-Rice coded remainder values.
// Indexed as [rice parameter 0..3][remainder value 0..RICEMAX-1]; values are
// bit costs in the fixed-point scale used by the RD decisions below
// (32768 per bit, judging by the 1-bit step between the first entries).
static const int32_t g_goRiceBits[4][RICEMAX] = {
    {32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144,
    327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
    393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
    458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
    {65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840,
    196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
    360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
    425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
    {98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072,
    163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
    229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
    327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
    {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072,
    163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
    196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
    229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
};
// Unconditionally replace decision slot `decision_id` with the
// "skip this sub-block" choice taken from state slot
// `decision_id + skip_offset` (used when the coefficient is zeroed out).
static INLINE void checkRdCostSkipSbbZeroOut(
  Decision* decision,
  const all_depquant_states* const state,
  int decision_id,
  int skip_offset) {
  const int src = decision_id + skip_offset;
  decision->rdCost[decision_id]   = state->m_rdCost[src] + state->m_sbbFracBits[src][0];
  decision->absLevel[decision_id] = 0;
  // prevId >= 4 encodes "came from a skip state" with its state id.
  decision->prevId[decision_id]   = 4 + state->m_stateId[src];
}
// Compare the "skip this sub-block" cost from state slot
// `skip_offset + decision_id` against the current best decision and
// overwrite the decision only when the skip choice is strictly cheaper.
static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
{
  const int src = skip_offset + decision_id;
  const int64_t skip_cost = state->m_rdCost[src] + state->m_sbbFracBits[src][0];
  if (skip_cost >= decisions->rdCost[decision_id]) {
    return;  // current decision is at least as good
  }
  decisions->rdCost[decision_id]   = skip_cost;
  decisions->absLevel[decision_id] = 0;
  // prevId >= 4 encodes "came from a skip state" with its state id.
  decisions->prevId[decision_id]   = 4 + state->m_stateId[src];
}
// Evaluate starting a new coding path (prevId == -1) for decision slot
// `decision_id`: distortion delta plus `lastOffset` plus the level's bit
// cost (direct table lookup for levels < 4, otherwise context-coded prefix
// plus a Golomb-Rice remainder). Updates the decision only on improvement.
static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int decision_id)
{
  const coeff_t level = pqData->absLevel[decision_id];
  int64_t cost = pqData->deltaDist[decision_id] + lastOffset;

  if (level < 4) {
    cost += state->m_coeffFracBits[level];
  } else {
    // Remainder is Rice-coded; clamp its index to the table size.
    const coeff_t remainder = (level - 4) >> 1;
    const int rice_idx = remainder < RICEMAX ? remainder : RICEMAX - 1;
    cost += state->m_coeffFracBits[level - (remainder << 1)];
    cost += g_goRiceBits[state->m_goRicePar][rice_idx];
  }

  if (cost >= decisions->rdCost[decision_id]) {
    return;  // existing decision is at least as good
  }
  decisions->rdCost[decision_id]   = cost;
  decisions->absLevel[decision_id] = level;
  decisions->prevId[decision_id]   = -1;  // -1 marks a fresh start
}
// Neutral initial decisions: the first four slots carry "no decision yet"
// (huge rdCost, absLevel -1, prevId -2); the last four carry the skip-state
// placeholders (absLevel 0, prevId 4..7). Copied into `decisions` before
// each xDecide evaluation.
static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2},
.absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} };
// Pre-quantize one transform coefficient: derive the base quantization
// index, then fill pqData with the distortion delta and absolute level of
// the four consecutive candidate indices qIdx .. qIdx+3, slotted by
// (index & 3) so each of the four quantizer states finds its candidate.
static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
{
  const int64_t scaledOrg = (int64_t)absCoeff * quanCoeff;
  coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift)));
  int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact;

  for (int i = 0; i < 4; ++i) {
    const int slot = qIdx & 3;
    pqData->deltaDist[slot] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
    // ++qIdx both rounds the level (>> 1) and advances to the next candidate.
    pqData->absLevel[slot]  = (++qIdx) >> 1;
    scaledAdd += qp->m_DistStepAdd;
  }
}
/*
 * Choose the best (cost, level, predecessor) decision for each of the four
 * trellis states at the current scan position.
 *
 * Zeroed-out coefficients can only be improved by the subblock-skip exit,
 * and only at the end of a coefficient subblock (SCAN_EOCSBB). Otherwise the
 * pre-quantized candidates are tested against all four predecessor states,
 * the skip exits (at EOCSBB), and a fresh path start in the even slots.
 */
static void xDecide(
  all_depquant_states* const all_states,
  depquant_state* const m_startState,
  quant_block* qp,
  const enum ScanPosType spt,
  const coeff_t absCoeff,
  const int lastOffset,
  Decision* decisions,
  bool zeroOut,
  coeff_t quanCoeff,
  const int skip_offset,
  const int prev_offset)
{
  *decisions = startDec;

  if (zeroOut) {
    if (spt == SCAN_EOCSBB) {
      for (int id = 0; id < 4; ++id) {
        checkRdCostSkipSbbZeroOut(decisions, all_states, id, skip_offset);
      }
    }
    return;
  }

  PQData pqData;
  preQuantCoeff(qp, absCoeff, &pqData, quanCoeff);

  // Argument pairs mirror the original fixed call sequence:
  // (decision slot, candidate slot) per predecessor state k.
  static const int decision_ids[4] = { 0, 2, 1, 3 };
  static const int pq_ids[4]       = { 2, 0, 3, 1 };
  for (int k = 0; k < 4; ++k) {
    uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, decision_ids[k], pq_ids[k], prev_offset + k);
  }

  if (spt == SCAN_EOCSBB) {
    for (int id = 0; id < 4; ++id) {
      checkRdCostSkipSbb(all_states, decisions, id, skip_offset);
    }
  }

  // A fresh coding path may only start in the even decision slots.
  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0);
  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2);
}
/*
 * Process one scan position of the dependent-quantization trellis:
 * rotate the current/previous state buffers, classify the position within
 * its 4x4 coefficient subblock, decide the best level per state (xDecide),
 * and propagate the chosen decisions into the state storage.
 */
static void uvg_dep_quant_decide_and_update_generic(
  rate_estimator_t* re,
  context_store* ctxs,
  struct dep_quant_scan_info const* const scan_info,
  const coeff_t absCoeff,
  const uint32_t scan_pos,
  const uint32_t width_in_sbb,
  const uint32_t height_in_sbb,
  const NbInfoSbb next_nb_info_ssb,
  bool zeroOut,
  coeff_t quantCoeff,
  const uint32_t effWidth,
  const uint32_t effHeight,
  bool is_chroma)
{
  Decision* const decisions = &ctxs->m_trellis[scan_pos];
  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);

  // Start/end-of-coefficient-subblock classification (subblocks are 16 coeffs).
  const uint32_t num_coeff = effWidth * effHeight;
  enum ScanPosType spt = 0;
  if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < num_coeff - 1) {
    spt = SCAN_SOCSBB;
  } else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < num_coeff - 16) {
    spt = SCAN_EOCSBB;
  }

  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff,
          re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y],
          decisions, zeroOut, quantCoeff, ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);

  if (scan_pos == 0) {
    return; // first position in scan order: nothing to propagate
  }

  if ((scan_pos & 15) == 0) {
    // Subblock boundary: flip the SBB context buffers and run the
    // end-of-subblock update for all four trellis states.
    SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
    for (int state = 0; state < 4; ++state) {
      uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos,
        scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma],
        width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below,
        decisions, state);
    }
    // Mirror decision slots 0-3 into slots 4-7.
    memcpy(decisions->prevId + 4,   decisions->prevId,   4 * sizeof(int32_t));
    memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t));
    memcpy(decisions->rdCost + 4,   decisions->rdCost,   4 * sizeof(int64_t));
  } else if (!zeroOut) {
    for (int state = 0; state < 4; ++state) {
      uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions,
        scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma],
        next_nb_info_ssb, 4, false, state);
    }
  }

  if (spt == SCAN_SOCSBB) {
    SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int);
  }
}
void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height)
{
const int default_quant_coeff = dep_quant_context->m_quant->m_QScale;
const int32_t thres = dep_quant_context->m_quant->m_thresLast;
int temp = *firstTestPos;
for (; temp >= 0; (temp)--) {
coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff));
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
break;
}
}
*firstTestPos = temp;
}
/*
 * Register the plain-C dependent-quantization kernels with the strategy
 * selector. Returns nonzero only when every registration succeeded.
 * The bitdepth parameter is part of the registrar signature and is unused
 * by the generic implementations.
 */
int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
{
  bool ok = uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic);
  ok &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic);
  return ok;
}

View file

@ -0,0 +1,50 @@
#ifndef STRATEGIES_DEPQUANT_GENERIC_H_
#define STRATEGIES_DEPQUANT_GENERIC_H_
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Generic C implementations of optimized functions.
*/
#include "cu.h"
#include "encoderstate.h"
#include "global.h" // IWYU pragma: keep
#include "uvg266.h"
#include "tables.h"
int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth);
#endif //STRATEGIES_DEPQUANT_GENERIC_H_

View file

@ -54,11 +54,16 @@
void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
uint8_t width,
const cu_loc_t * const cu_loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,
double* bits_out) {
double* bits_out)
{
const int x = cu_loc->x;
const int y = cu_loc->y;
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
//const encoder_control_t * const encoder = state->encoder_control;
//int c1 = 1;
@ -75,12 +80,12 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
// CONSTANTS
const int height = width; // TODO: height for non-square blocks.
const uint32_t log2_block_size = uvg_g_convert_to_bit[width]+2;
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1];
const uint32_t *scan =
uvg_g_sig_last_scan[scan_mode][log2_block_size - 1];
const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode];
const uint8_t log2_block_width = uvg_g_convert_to_log2[width];
const uint8_t log2_block_height = uvg_g_convert_to_log2[height];
const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height);
// Init base contexts according to block type
@ -90,12 +95,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
unsigned scan_cg_last = (unsigned)-1;
unsigned scan_pos_last = (unsigned)-1;
for (int i = 0; i < width * width; i++) {
for (int i = 0; i < (width * height); ++i) {
if (coeff[scan[i]]) {
scan_pos_last = i;
sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1;
}
}
scan_cg_last = scan_pos_last >> log2_cg_size;
int pos_last = scan[scan_pos_last];
@ -120,28 +126,33 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
last_coeff_x,
last_coeff_y,
width,
width,
height,
color,
scan_mode,
bits_out);
uint32_t quant_state_transition_table = 0; //ToDo: dep quant enable changes this
uint32_t quant_state_transition_table = state->encoder_control->cfg.dep_quant ? 32040 : 0;
int32_t quant_state = 0;
uint8_t ctx_offset[16];
int32_t temp_diag = -1;
int32_t temp_sum = -1;
int32_t reg_bins = (width*width * 28) >> 4; //8 for 2x2
int32_t reg_bins = (width * height * 28) >> 4; //8 for 2x2
// significant_coeff_flag
for (i = scan_cg_last; i >= 0; i--) {
//int32_t abs_coeff[64*64];
const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0];
const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width);
const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height);
int32_t cg_blk_pos = scan_cg[i];
int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2));
int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2)));
int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> log2_cg_width);
int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> log2_cg_width));
// !!! residual_coding_subblock() !!!
@ -151,7 +162,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
} else {
uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0);
uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
cg_pos_y, (MIN((uint8_t)32, width) >> (log2_cg_size / 2)));
cg_pos_y, cg_width, cg_height);
CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "significant_coeffgroup_flag");
}
@ -182,7 +193,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
sig = (coeff[blk_pos] != 0) ? 1 : 0;
if (num_non_zero || next_sig_pos != infer_sig_pos) {
ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]);
cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]);
@ -190,7 +201,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
reg_bins--;
} else if (next_sig_pos != scan_pos_last) {
ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum);
ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum);
}
@ -256,7 +267,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
blk_pos = scan[scan_pos];
pos_y = blk_pos / width;
pos_x = blk_pos - (pos_y * width);
int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4);
int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4);
rice_param = g_go_rice_pars[abs_sum];
uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]);
@ -274,7 +285,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
pos_y = blk_pos / width;
pos_x = blk_pos - (pos_y * width);
uint32_t coeff_abs = abs(coeff[blk_pos]);
int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0);
int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0);
rice_param = g_go_rice_pars[abs_sum];
pos0 = ((quant_state<2)?1:2) << rice_param;
uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? coeff_abs - 1 : coeff_abs);
@ -291,7 +302,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
uint32_t num_signs = num_non_zero;
if (state->encoder_control->cfg.signhide_enable && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) {
if (state->encoder_control->cfg.signhide_enable && !state->encoder_control->cfg.dep_quant && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) {
num_signs--;
coeff_signs >>= 1;
}

View file

@ -44,7 +44,7 @@
void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
uint8_t width,
const cu_loc_t * const loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,

View file

@ -34,6 +34,7 @@
#include <stdlib.h>
#include "cu.h"
#include "intra.h"
#include "uvg266.h"
#include "strategyselector.h"
@ -42,25 +43,32 @@
/**
* \brief Generate angular predictions.
* \param log2_width Log2 of width, range 2..5.
* \param cu_loc CU location and size data.
* \param intra_mode Angular mode in range 2..34.
* \param channel_type Color channel.
* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1.
* \param in_ref_left Pointer to -1 index of left reference, length=height*2+1.
* \param dst Buffer of size width*width.
* \param multi_ref_idx Multi reference line index for use with MRL.
*/
static void uvg_angular_pred_generic(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
const int_fast8_t intra_mode,
const int_fast8_t channel_type,
const uvg_pixel *const in_ref_above,
const uvg_pixel *const in_ref_left,
uvg_pixel *const dst,
const uint8_t multi_ref_idx)
const uint8_t multi_ref_idx,
const uint8_t isp_mode,
const int cu_dim)
{
int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
assert(log2_width >= 2 && log2_width <= 5);
assert(intra_mode >= 2 && intra_mode <= 66);
assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5);
// assert(intra_mode >= 2 && intra_mode <= 66);
static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 };
static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp
@ -105,126 +113,105 @@ static void uvg_angular_pred_generic(
// Temporary buffer for modes 11-25.
// It only needs to be big enough to hold indices from -width to width-1.
uvg_pixel temp_dst[TR_MAX_WIDTH * TR_MAX_WIDTH];
// TODO: check the correct size for these arrays when MRL is used
//uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
const int_fast32_t width = 1 << log2_width;
uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
uint32_t pred_mode = intra_mode; // ToDo: handle WAIP
uint8_t multi_ref_index = multi_ref_idx;
uint8_t isp = isp_mode;
// Whether to swap references to always project on the left reference row.
const bool vertical_mode = intra_mode >= 34;
// Modes distance to horizontal or vertical mode.
const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18);
//const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
// Sample displacement per column in fractions of 32.
const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
const int16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
// TODO: replace latter width with height
int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]);
const int side_size = vertical_mode ? log2_height : log2_width;
int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]);
// Pointer for the reference we are interpolating from.
uvg_pixel *ref_main;
// Pointer for the other reference.
const uvg_pixel *ref_side;
uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst;
const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1;
const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1;
// Set ref_main and ref_side such that, when indexed with 0, they point to
// index 0 in block coordinates.
if (sample_disp < 0) {
memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel));
memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel));
// TODO: for non-square blocks, separate loops for x and y is needed
for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
ref_main = vertical_mode ? temp_above + height : temp_left + width;
ref_side = vertical_mode ? temp_left + width : temp_above + height;
int size_side = vertical_mode ? height : width;
for (int i = -size_side; i <= -1; i++) {
ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)];
}
// TODO: take into account non-square blocks
ref_main = temp_main + width;
ref_side = temp_side + width;
// TODO: for non square blocks, need to check if width or height is used for reference extension
for (int i = -width; i <= -1; i++) {
ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)];
}
//const uint32_t index_offset = width + 1;
//const int32_t last_index = width;
//const int_fast32_t most_negative_index = (width * sample_disp) >> 5;
//// Negative sample_disp means, we need to use both references.
//// TODO: update refs to take into account variating block size and shapes
//// (height is not always equal to width)
//ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
//ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
//// Move the reference pixels to start from the middle to the later half of
//// the tmp_ref, so there is room for negative indices.
//for (int_fast32_t x = -1; x < width; ++x) {
// tmp_ref[x + index_offset] = ref_main[x];
//}
//// Get a pointer to block index 0 in tmp_ref.
//ref_main = &tmp_ref[index_offset];
//tmp_ref[index_offset -1] = tmp_ref[index_offset];
//// Extend the side reference to the negative indices of main reference.
//int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
//int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
//// TODO: add 'vertical_mode ? height : width' instead of 'width'
//
//for (int_fast32_t x = -1; x > most_negative_index; x--) {
// col_sample_disp += inv_abs_sample_disp;
// int_fast32_t side_index = col_sample_disp >> 8;
// tmp_ref[x + index_offset - 1] = ref_side[side_index - 1];
//}
//tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1];
//tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset];
}
else {
memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel));
memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel));
// TODO: again, separate loop needed for non-square blocks
for (int i = 0; i <= (width << 1) + multi_ref_index; i++) {
temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
}
ref_main = vertical_mode ? temp_above : temp_left;
ref_side = vertical_mode ? temp_left : temp_above;
// TODO: this code block will need to change also when non-square blocks are used
// const int log2_ratio = 0;
const int s = 0;
const int log2_ratio = log2_width - log2_height;
const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio);
const int max_index = (multi_ref_index << s) + 2;
const int ref_length = width << 1;
const uvg_pixel val = temp_main[ref_length + multi_ref_index];
int ref_length;
if (isp_mode) {
ref_length = vertical_mode ? top_ref_length : left_ref_length;
}
else {
ref_length = vertical_mode ? width << 1 : height << 1;
}
const uvg_pixel val = ref_main[ref_length + multi_ref_index];
for (int j = 1; j <= max_index; j++) {
temp_main[ref_length + multi_ref_index + j] = val;
ref_main[ref_length + multi_ref_index + j] = val;
}
}
ref_main = temp_main;
ref_side = temp_side;
//// sample_disp >= 0 means we don't need to refer to negative indices,
//// which means we can just use the references as is.
//ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
//ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
//memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel));
//ref_main = &tmp_ref[width];
//tmp_ref[width-1] = tmp_ref[width];
//int8_t last_index = 1 + width*2;
//tmp_ref[width + last_index] = tmp_ref[width + last_index - 1];
}
// compensate for line offset in reference line buffers
ref_main += multi_ref_index;
ref_side += multi_ref_index;
if (!vertical_mode) { SWAP(width, height, int) }
if (sample_disp != 0) {
bool use_cubic = true; // Default to cubic filter
static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1];
int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
if (dist_from_vert_or_hor > filter_threshold) {
if ((abs(sample_disp) & 0x1F) != 0)
{
use_cubic = false;
}
}
// Cubic must be used if ref line != 0 or if isp mode is != 0
if (multi_ref_index || isp) {
use_cubic = true;
}
// The mode is not horizontal or vertical, we have to do interpolation.
for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) {
for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) {
int_fast32_t delta_int = delta_pos >> 5;
int_fast32_t delta_fract = delta_pos & (32 - 1);
const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 };
int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff;
if ((abs(sample_disp) & 0x1F) != 0) {
@ -232,25 +219,7 @@ static void uvg_angular_pred_generic(
if (channel_type == 0) {
int32_t ref_main_index = delta_int;
uvg_pixel p[4];
bool use_cubic = true; // Default to cubic filter
static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width];
int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
if (dist_from_vert_or_hor > filter_threshold) {
static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 };
const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode;
const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
if ((abs(sample_disp) & 0x1F) != 0)
{
use_cubic = false;
}
}
// Cubic must be used if ref line != 0
if (multi_ref_index) {
use_cubic = true;
}
const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 };
int16_t const * const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff;
// Do 4-tap intra interpolation filtering
for (int_fast32_t x = 0; x < width; x++, ref_main_index++) {
p[0] = ref_main[ref_main_index];
@ -258,7 +227,7 @@ static void uvg_angular_pred_generic(
p[2] = ref_main[ref_main_index + 2];
p[3] = ref_main[ref_main_index + 3];
dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6);
work[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6);
}
}
@ -268,26 +237,26 @@ static void uvg_angular_pred_generic(
for (int_fast32_t x = 0; x < width; ++x) {
uvg_pixel ref1 = ref_main[x + delta_int + 1];
uvg_pixel ref2 = ref_main[x + delta_int + 2];
dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5);
work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5);
}
}
}
else {
// Just copy the integer samples
for (int_fast32_t x = 0; x < width; x++) {
dst[y * width + x] = ref_main[x + delta_int + 1];
work[y * width + x] = ref_main[x + delta_int + 1];
}
}
// PDPC
bool PDPC_filter = (width >= 4 || channel_type != 0);
bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0;
if (pred_mode > 1 && pred_mode < 67) {
if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
PDPC_filter = false;
}
else if (mode_disp > 0) {
PDPC_filter = (scale >= 0);
PDPC_filter &= (scale >= 0);
}
}
if(PDPC_filter) {
@ -297,70 +266,50 @@ static void uvg_angular_pred_generic(
int wL = 32 >> (2 * x >> scale);
const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1];
dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6);
work[y * width + x] = work[y * width + x] + ((wL * (left - work[y * width + x]) + 32) >> 6);
}
}
/*
if (pred_mode == 2 || pred_mode == 66) {
int wT = 16 >> MIN(31, ((y << 1) >> scale));
for (int x = 0; x < width; x++) {
int wL = 16 >> MIN(31, ((x << 1) >> scale));
if (wT + wL == 0) break;
int c = x + y + 1;
if (c >= 2 * width) { wL = 0; }
if (c >= 2 * width) { wT = 0; }
const uvg_pixel left = (wL != 0) ? ref_side[c] : 0;
const uvg_pixel top = (wT != 0) ? ref_main[c] : 0;
dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6);
}
} else if (sample_disp == 0 || sample_disp >= 12) {
int inv_angle_sum_0 = 2;
for (int x = 0; x < width; x++) {
inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)];
int delta_pos_0 = inv_angle_sum_0 >> 2;
int delta_frac_0 = delta_pos_0 & 63;
int delta_int_0 = delta_pos_0 >> 6;
int delta_y = y + delta_int_0 + 1;
// TODO: convert to JVET_K0500_WAIP
if (delta_y > width + width - 1) break;
int wL = 32 >> MIN(31, ((x << 1) >> scale));
if (wL == 0) break;
const uvg_pixel *p = ref_side + delta_y - 1;
uvg_pixel left = p[delta_frac_0 >> 5];
dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6);
}
}*/
}
}
else {
// Mode is horizontal or vertical, just copy the pixels.
// TODO: update outer loop to use height instead of width
for (int_fast32_t y = 0; y < width; ++y) {
for (int_fast32_t x = 0; x < width; ++x) {
dst[y * width + x] = ref_main[x + 1];
}
// Do not apply PDPC if multi ref line index is other than 0
if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) {
int scale = (log2_width + log2_width - 2) >> 2;
// TODO: do not do PDPC if block is in BDPCM mode
bool do_pdpc = ((width >= 4 && height >= 4) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/);
if (do_pdpc) {
int scale = (log2_width + log2_height - 2) >> 2;
const uvg_pixel top_left = ref_main[0];
for (int_fast32_t y = 0; y < height; ++y) {
memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel));
const uvg_pixel left = ref_side[1 + y];
for (int i = 0; i < MIN(3 << scale, width); i++) {
const int wL = 32 >> (2 * i >> scale);
const uvg_pixel val = dst[y * width + i];
dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) {
const int wL = 32 >> (2 * x >> scale);
const uvg_pixel val = work[y * width + x];
work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
}
}
} else {
for (int_fast32_t y = 0; y < height; ++y) {
memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel));
}
}
}
// Flip the block if this is was a horizontal mode.
if (!vertical_mode) {
for (int_fast32_t y = 0; y < width - 1; ++y) {
if(width == height) {
for (int_fast32_t y = 0; y < height - 1; ++y) {
for (int_fast32_t x = y + 1; x < width; ++x) {
SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel);
SWAP(work[y * height + x], work[x * width + y], uvg_pixel);
}
}
} else {
for(int y = 0; y < width; ++y) {
for(int x = 0; x < height; ++x) {
dst[x + y * height] = work[y + x * width];
}
}
}
}
@ -369,23 +318,32 @@ static void uvg_angular_pred_generic(
/**
* \brief Generate planar prediction.
* \param log2_width Log2 of width, range 2..5.
* \param cu_loc CU location and size data.
* \param color Color channel.
* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1.
* \param dst Buffer of size width*width.
*/
static void uvg_intra_pred_planar_generic(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
color_t color,
const uvg_pixel *const ref_top,
const uvg_pixel *const ref_left,
uvg_pixel *const dst)
{
// TODO: Add height
assert(log2_width >= 2 && log2_width <= 5);
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
const int offset = 1 << (log2_width + log2_height);
const int final_shift = 1 + log2_width + log2_height;
// If ISP is enabled log_dim 1 is possible (limit was previously 2)
assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5);
const int_fast8_t width = 1 << log2_width;
const uvg_pixel top_right = ref_top[width + 1];
const uvg_pixel bottom_left = ref_left[width + 1];
const uvg_pixel bottom_left = ref_left[height + 1];
#if 0
// Unoptimized version for reference.
@ -397,18 +355,27 @@ static void uvg_intra_pred_planar_generic(
}
}
#else
int_fast16_t top[32];
// TODO: get rid of magic numbers. Make a define for this
int_fast16_t top[64];
int_fast16_t bottom[64];
int_fast16_t left[64];
int_fast16_t right[64];
for (int i = 0; i < width; ++i) {
top[i] = ref_top[i + 1] << log2_width;
bottom[i] = bottom_left - ref_top[i + 1];
top[i] = ref_top[i + 1] << log2_height;
}
for (int y = 0; y < width; ++y) {
int_fast16_t hor = (ref_left[y + 1] << log2_width) + width;
for (int j = 0; j < height; ++j) {
right[j] = top_right - ref_left[j + 1];
left[j] = ref_left[j + 1] << log2_width;
}
for (int y = 0; y < height; ++y) {
int_fast16_t hor = left[y];
for (int x = 0; x < width; ++x) {
hor += top_right - ref_left[y + 1];
top[x] += bottom_left - ref_top[x + 1];
dst[y * width + x] = (hor + top[x]) >> (log2_width + 1);
//
hor += right[y];
top[x] += bottom[x];
dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift;
}
}
#endif
@ -461,25 +428,26 @@ static void uvg_intra_pred_filtered_dc_generic(
/**
* \brief Position Dependent Prediction Combination for Planar and DC modes.
* \param log2_width Log2 of width, range 2..5.
* \param width Block width matching log2_width.
* \param cu_loc CU location and size data.
* \param used_ref Pointer used reference pixel struct.
* \param dst Buffer of size width*width.
*/
static void uvg_pdpc_planar_dc_generic(
const int mode,
const int width,
const int log2_width,
const cu_loc_t* const cu_loc,
const color_t color,
const uvg_intra_ref *const used_ref,
uvg_pixel *const dst)
{
assert(mode == 0 || mode == 1); // planar or DC
const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
const int log2_width = uvg_g_convert_to_log2[width];
const int log2_height = uvg_g_convert_to_log2[height];
// TODO: replace latter log2_width with log2_height
const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2);
const int scale = (log2_width + log2_height - 2) >> 2;
// TODO: replace width with height
for (int y = 0; y < width; y++) {
for (int y = 0; y < height; y++) {
int wT = 32 >> MIN(31, ((y << 1) >> scale));
for (int x = 0; x < width; x++) {
int wL = 32 >> MIN(31, ((x << 1) >> scale));

View file

@ -32,6 +32,7 @@
#include "strategies/generic/picture-generic.h"
#include <math.h>
#include <stdlib.h>
#include "strategies/strategies-picture.h"
@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel)
SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)
/**
 * \brief Sum of absolute Hadamard-transformed differences for a 2x2 block.
 *
 * Computes the residual of the two 2x2 blocks, applies a 2x2 Hadamard
 * transform and sums the coefficient magnitudes. The DC coefficient is
 * down-weighted by a factor of four, matching the VTM reference encoder.
 *
 * \param piOrg       Pointer to the original (reference) block.
 * \param piCur       Pointer to the current (predicted) block.
 * \param iStrideOrg  Stride of the original block in pixels.
 * \param iStrideCur  Stride of the current block in pixels.
 * \return SATD of the 2x2 block.
 */
static uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  // Pixel-wise residual of the 2x2 block.
  coeff_t d00 = piOrg[0]              - piCur[0];
  coeff_t d01 = piOrg[1]              - piCur[1];
  coeff_t d10 = piOrg[iStrideOrg]     - piCur[iStrideCur];
  coeff_t d11 = piOrg[iStrideOrg + 1] - piCur[iStrideCur + 1];

  // Vertical butterfly (sum/difference of the two rows).
  coeff_t s0 = d00 + d10;
  coeff_t s1 = d01 + d11;
  coeff_t t0 = d00 - d10;
  coeff_t t1 = d01 - d11;

  // Horizontal butterfly; accumulate magnitudes with the DC term scaled down.
  uint64_t sum = 0;
  sum += abs(s0 + s1) >> 2;
  sum += abs(s0 - s1);
  sum += abs(t0 + t1);
  sum += abs(t0 - t1);
  return sum;
}
/**
 * \brief Sum of absolute Hadamard-transformed differences for a 16x8 block.
 *
 * Ported from the VTM reference encoder. Computes the residual of the two
 * blocks, applies a fully unrolled 16-point horizontal and 8-point vertical
 * Hadamard transform, and sums the magnitudes of the resulting coefficients.
 * The DC coefficient is down-weighted by four and the total is normalized by
 * 2 / sqrt(16 * 8) so the result is comparable with square-block SATD.
 *
 * \param piOrg       Pointer to the original (reference) block.
 * \param piCur       Pointer to the current (predicted) block.
 * \param iStrideOrg  Stride of the original block in pixels.
 * \param iStrideCur  Stride of the current block in pixels.
 * \return Normalized SATD of the 16x8 block.
 */
static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{ // TODO: add a SIMD implementation (note from original port, JCA)
  int k, i, j, jj, sad = 0;
  int diff[128], m1[8][16], m2[8][16];
  // Residual: one unrolled 16-pixel row per iteration (8 rows total).
  for (k = 0; k < 128; k += 16)
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];
    diff[k + 8] = piOrg[8] - piCur[8];
    diff[k + 9] = piOrg[9] - piCur[9];
    diff[k + 10] = piOrg[10] - piCur[10];
    diff[k + 11] = piOrg[11] - piCur[11];
    diff[k + 12] = piOrg[12] - piCur[12];
    diff[k + 13] = piOrg[13] - piCur[13];
    diff[k + 14] = piOrg[14] - piCur[14];
    diff[k + 15] = piOrg[15] - piCur[15];
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }
  // Horizontal transform: 16-point Hadamard butterflies on each of the 8 rows.
  for (j = 0; j < 8; j++)
  {
    jj = j << 4;
    // Butterfly stage 1 (stride 8).
    m2[j][0] = diff[jj] + diff[jj + 8];
    m2[j][1] = diff[jj + 1] + diff[jj + 9];
    m2[j][2] = diff[jj + 2] + diff[jj + 10];
    m2[j][3] = diff[jj + 3] + diff[jj + 11];
    m2[j][4] = diff[jj + 4] + diff[jj + 12];
    m2[j][5] = diff[jj + 5] + diff[jj + 13];
    m2[j][6] = diff[jj + 6] + diff[jj + 14];
    m2[j][7] = diff[jj + 7] + diff[jj + 15];
    m2[j][8] = diff[jj] - diff[jj + 8];
    m2[j][9] = diff[jj + 1] - diff[jj + 9];
    m2[j][10] = diff[jj + 2] - diff[jj + 10];
    m2[j][11] = diff[jj + 3] - diff[jj + 11];
    m2[j][12] = diff[jj + 4] - diff[jj + 12];
    m2[j][13] = diff[jj + 5] - diff[jj + 13];
    m2[j][14] = diff[jj + 6] - diff[jj + 14];
    m2[j][15] = diff[jj + 7] - diff[jj + 15];
    // Butterfly stage 2 (stride 4).
    m1[j][0] = m2[j][0] + m2[j][4];
    m1[j][1] = m2[j][1] + m2[j][5];
    m1[j][2] = m2[j][2] + m2[j][6];
    m1[j][3] = m2[j][3] + m2[j][7];
    m1[j][4] = m2[j][0] - m2[j][4];
    m1[j][5] = m2[j][1] - m2[j][5];
    m1[j][6] = m2[j][2] - m2[j][6];
    m1[j][7] = m2[j][3] - m2[j][7];
    m1[j][8] = m2[j][8] + m2[j][12];
    m1[j][9] = m2[j][9] + m2[j][13];
    m1[j][10] = m2[j][10] + m2[j][14];
    m1[j][11] = m2[j][11] + m2[j][15];
    m1[j][12] = m2[j][8] - m2[j][12];
    m1[j][13] = m2[j][9] - m2[j][13];
    m1[j][14] = m2[j][10] - m2[j][14];
    m1[j][15] = m2[j][11] - m2[j][15];
    // Butterfly stage 3 (stride 2).
    m2[j][0] = m1[j][0] + m1[j][2];
    m2[j][1] = m1[j][1] + m1[j][3];
    m2[j][2] = m1[j][0] - m1[j][2];
    m2[j][3] = m1[j][1] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][6];
    m2[j][5] = m1[j][5] + m1[j][7];
    m2[j][6] = m1[j][4] - m1[j][6];
    m2[j][7] = m1[j][5] - m1[j][7];
    m2[j][8] = m1[j][8] + m1[j][10];
    m2[j][9] = m1[j][9] + m1[j][11];
    m2[j][10] = m1[j][8] - m1[j][10];
    m2[j][11] = m1[j][9] - m1[j][11];
    m2[j][12] = m1[j][12] + m1[j][14];
    m2[j][13] = m1[j][13] + m1[j][15];
    m2[j][14] = m1[j][12] - m1[j][14];
    m2[j][15] = m1[j][13] - m1[j][15];
    // Butterfly stage 4 (stride 1).
    m1[j][0] = m2[j][0] + m2[j][1];
    m1[j][1] = m2[j][0] - m2[j][1];
    m1[j][2] = m2[j][2] + m2[j][3];
    m1[j][3] = m2[j][2] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][5];
    m1[j][5] = m2[j][4] - m2[j][5];
    m1[j][6] = m2[j][6] + m2[j][7];
    m1[j][7] = m2[j][6] - m2[j][7];
    m1[j][8] = m2[j][8] + m2[j][9];
    m1[j][9] = m2[j][8] - m2[j][9];
    m1[j][10] = m2[j][10] + m2[j][11];
    m1[j][11] = m2[j][10] - m2[j][11];
    m1[j][12] = m2[j][12] + m2[j][13];
    m1[j][13] = m2[j][12] - m2[j][13];
    m1[j][14] = m2[j][14] + m2[j][15];
    m1[j][15] = m2[j][14] - m2[j][15];
  }
  // Vertical transform: 8-point Hadamard butterflies on each of the 16 columns.
  for (i = 0; i < 16; i++)
  {
    // Butterfly stage 1 (stride 4).
    m2[0][i] = m1[0][i] + m1[4][i];
    m2[1][i] = m1[1][i] + m1[5][i];
    m2[2][i] = m1[2][i] + m1[6][i];
    m2[3][i] = m1[3][i] + m1[7][i];
    m2[4][i] = m1[0][i] - m1[4][i];
    m2[5][i] = m1[1][i] - m1[5][i];
    m2[6][i] = m1[2][i] - m1[6][i];
    m2[7][i] = m1[3][i] - m1[7][i];
    // Butterfly stage 2 (stride 2).
    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    m1[4][i] = m2[4][i] + m2[6][i];
    m1[5][i] = m2[5][i] + m2[7][i];
    m1[6][i] = m2[4][i] - m2[6][i];
    m1[7][i] = m2[5][i] - m2[7][i];
    // Butterfly stage 3 (stride 1).
    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
  }
  // Accumulate coefficient magnitudes.
  for (i = 0; i < 8; i++)
  {
    for (j = 0; j < 16; j++)
    {
      sad += abs(m2[i][j]);
    }
  }
  // Replace the DC contribution with a quarter-weighted one (VTM convention).
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
  // Normalize to be comparable with square-block SATD.
  sad = (int)(sad / sqrt(16.0 * 8) * 2);
  return sad;
}
/**
 * \brief Sum of absolute Hadamard-transformed differences for an 8x16 block.
 *
 * Ported from the VTM reference encoder. Transpose-shaped counterpart of
 * xCalcHADs16x8: an 8-point horizontal and 16-point vertical Hadamard
 * transform are applied to the residual, coefficient magnitudes are summed,
 * the DC coefficient is down-weighted by four and the total is normalized by
 * 2 / sqrt(16 * 8).
 *
 * \param piOrg       Pointer to the original (reference) block.
 * \param piCur       Pointer to the current (predicted) block.
 * \param iStrideOrg  Stride of the original block in pixels.
 * \param iStrideCur  Stride of the current block in pixels.
 * \return Normalized SATD of the 8x16 block.
 */
static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  int k, i, j, jj, sad = 0;
  int diff[128], m1[16][8], m2[16][8];
  // Residual: one unrolled 8-pixel row per iteration (16 rows total).
  for (k = 0; k < 128; k += 8)
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }
  // Horizontal transform: 8-point Hadamard butterflies on each of the 16 rows.
  for (j = 0; j < 16; j++)
  {
    jj = j << 3;
    // Butterfly stage 1 (stride 4).
    m2[j][0] = diff[jj] + diff[jj + 4];
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
    m2[j][4] = diff[jj] - diff[jj + 4];
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
    // Butterfly stage 2 (stride 2).
    m1[j][0] = m2[j][0] + m2[j][2];
    m1[j][1] = m2[j][1] + m2[j][3];
    m1[j][2] = m2[j][0] - m2[j][2];
    m1[j][3] = m2[j][1] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][6];
    m1[j][5] = m2[j][5] + m2[j][7];
    m1[j][6] = m2[j][4] - m2[j][6];
    m1[j][7] = m2[j][5] - m2[j][7];
    // Butterfly stage 3 (stride 1).
    m2[j][0] = m1[j][0] + m1[j][1];
    m2[j][1] = m1[j][0] - m1[j][1];
    m2[j][2] = m1[j][2] + m1[j][3];
    m2[j][3] = m1[j][2] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][5];
    m2[j][5] = m1[j][4] - m1[j][5];
    m2[j][6] = m1[j][6] + m1[j][7];
    m2[j][7] = m1[j][6] - m1[j][7];
  }
  // Vertical transform: 16-point Hadamard butterflies on each of the 8 columns.
  for (i = 0; i < 8; i++)
  {
    // Butterfly stage 1 (stride 8).
    m1[0][i] = m2[0][i] + m2[8][i];
    m1[1][i] = m2[1][i] + m2[9][i];
    m1[2][i] = m2[2][i] + m2[10][i];
    m1[3][i] = m2[3][i] + m2[11][i];
    m1[4][i] = m2[4][i] + m2[12][i];
    m1[5][i] = m2[5][i] + m2[13][i];
    m1[6][i] = m2[6][i] + m2[14][i];
    m1[7][i] = m2[7][i] + m2[15][i];
    m1[8][i] = m2[0][i] - m2[8][i];
    m1[9][i] = m2[1][i] - m2[9][i];
    m1[10][i] = m2[2][i] - m2[10][i];
    m1[11][i] = m2[3][i] - m2[11][i];
    m1[12][i] = m2[4][i] - m2[12][i];
    m1[13][i] = m2[5][i] - m2[13][i];
    m1[14][i] = m2[6][i] - m2[14][i];
    m1[15][i] = m2[7][i] - m2[15][i];
    // Butterfly stage 2 (stride 4).
    m2[0][i] = m1[0][i] + m1[4][i];
    m2[1][i] = m1[1][i] + m1[5][i];
    m2[2][i] = m1[2][i] + m1[6][i];
    m2[3][i] = m1[3][i] + m1[7][i];
    m2[4][i] = m1[0][i] - m1[4][i];
    m2[5][i] = m1[1][i] - m1[5][i];
    m2[6][i] = m1[2][i] - m1[6][i];
    m2[7][i] = m1[3][i] - m1[7][i];
    m2[8][i] = m1[8][i] + m1[12][i];
    m2[9][i] = m1[9][i] + m1[13][i];
    m2[10][i] = m1[10][i] + m1[14][i];
    m2[11][i] = m1[11][i] + m1[15][i];
    m2[12][i] = m1[8][i] - m1[12][i];
    m2[13][i] = m1[9][i] - m1[13][i];
    m2[14][i] = m1[10][i] - m1[14][i];
    m2[15][i] = m1[11][i] - m1[15][i];
    // Butterfly stage 3 (stride 2).
    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    m1[4][i] = m2[4][i] + m2[6][i];
    m1[5][i] = m2[5][i] + m2[7][i];
    m1[6][i] = m2[4][i] - m2[6][i];
    m1[7][i] = m2[5][i] - m2[7][i];
    m1[8][i] = m2[8][i] + m2[10][i];
    m1[9][i] = m2[9][i] + m2[11][i];
    m1[10][i] = m2[8][i] - m2[10][i];
    m1[11][i] = m2[9][i] - m2[11][i];
    m1[12][i] = m2[12][i] + m2[14][i];
    m1[13][i] = m2[13][i] + m2[15][i];
    m1[14][i] = m2[12][i] - m2[14][i];
    m1[15][i] = m2[13][i] - m2[15][i];
    // Butterfly stage 4 (stride 1).
    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
    m2[8][i] = m1[8][i] + m1[9][i];
    m2[9][i] = m1[8][i] - m1[9][i];
    m2[10][i] = m1[10][i] + m1[11][i];
    m2[11][i] = m1[10][i] - m1[11][i];
    m2[12][i] = m1[12][i] + m1[13][i];
    m2[13][i] = m1[12][i] - m1[13][i];
    m2[14][i] = m1[14][i] + m1[15][i];
    m2[15][i] = m1[14][i] - m1[15][i];
  }
  // Accumulate coefficient magnitudes.
  for (i = 0; i < 16; i++)
  {
    for (j = 0; j < 8; j++)
    {
      sad += abs(m2[i][j]);
    }
  }
  // Replace the DC contribution with a quarter-weighted one (VTM convention).
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
  // Normalize to be comparable with square-block SATD.
  sad = (int)(sad / sqrt(16.0 * 8) * 2);
  return sad;
}
/**
 * \brief Sum of absolute Hadamard-transformed differences for a 4x8 block.
 *
 * Ported from the VTM reference encoder. Applies a 4-point horizontal and an
 * 8-point vertical Hadamard transform to the residual, sums coefficient
 * magnitudes, down-weights the DC coefficient by four and normalizes by
 * 2 / sqrt(4 * 8).
 *
 * \param piOrg       Pointer to the original (reference) block.
 * \param piCur       Pointer to the current (predicted) block.
 * \param iStrideOrg  Stride of the original block in pixels.
 * \param iStrideCur  Stride of the current block in pixels.
 * \return Normalized SATD of the 4x8 block.
 */
static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  int k, i, j, jj, sad = 0;
  int diff[32], m1[8][4], m2[8][4];
  // Residual: one unrolled 4-pixel row per iteration (8 rows total).
  for (k = 0; k < 32; k += 4)
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }
  // Horizontal transform: 4-point Hadamard butterflies on each of the 8 rows.
  for (j = 0; j < 8; j++)
  {
    jj = j << 2;
    // Butterfly stage 1 (stride 2).
    m2[j][0] = diff[jj] + diff[jj + 2];
    m2[j][1] = diff[jj + 1] + diff[jj + 3];
    m2[j][2] = diff[jj] - diff[jj + 2];
    m2[j][3] = diff[jj + 1] - diff[jj + 3];
    // Butterfly stage 2 (stride 1).
    m1[j][0] = m2[j][0] + m2[j][1];
    m1[j][1] = m2[j][0] - m2[j][1];
    m1[j][2] = m2[j][2] + m2[j][3];
    m1[j][3] = m2[j][2] - m2[j][3];
  }
  // Vertical transform: 8-point Hadamard butterflies on each of the 4 columns.
  for (i = 0; i < 4; i++)
  {
    // Butterfly stage 1 (stride 4).
    m2[0][i] = m1[0][i] + m1[4][i];
    m2[1][i] = m1[1][i] + m1[5][i];
    m2[2][i] = m1[2][i] + m1[6][i];
    m2[3][i] = m1[3][i] + m1[7][i];
    m2[4][i] = m1[0][i] - m1[4][i];
    m2[5][i] = m1[1][i] - m1[5][i];
    m2[6][i] = m1[2][i] - m1[6][i];
    m2[7][i] = m1[3][i] - m1[7][i];
    // Butterfly stage 2 (stride 2).
    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    m1[4][i] = m2[4][i] + m2[6][i];
    m1[5][i] = m2[5][i] + m2[7][i];
    m1[6][i] = m2[4][i] - m2[6][i];
    m1[7][i] = m2[5][i] - m2[7][i];
    // Butterfly stage 3 (stride 1).
    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
  }
  // Accumulate coefficient magnitudes.
  for (i = 0; i < 8; i++)
  {
    for (j = 0; j < 4; j++)
    {
      sad += abs(m2[i][j]);
    }
  }
  // Replace the DC contribution with a quarter-weighted one (VTM convention).
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
  // Normalize to be comparable with square-block SATD.
  sad = (int)(sad / sqrt(4.0 * 8) * 2);
  return sad;
}
/**
 * \brief Sum of absolute Hadamard-transformed differences for an 8x4 block.
 *
 * Ported from the VTM reference encoder. Transpose-shaped counterpart of
 * xCalcHADs4x8: an 8-point horizontal and a 4-point vertical Hadamard
 * transform are applied to the residual, coefficient magnitudes are summed,
 * the DC coefficient is down-weighted by four and the total is normalized by
 * 2 / sqrt(4 * 8).
 *
 * \param piOrg       Pointer to the original (reference) block.
 * \param piCur       Pointer to the current (predicted) block.
 * \param iStrideOrg  Stride of the original block in pixels.
 * \param iStrideCur  Stride of the current block in pixels.
 * \return Normalized SATD of the 8x4 block.
 */
static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  int k, i, j, jj, sad = 0;
  int diff[32], m1[4][8], m2[4][8];
  // Residual: one unrolled 8-pixel row per iteration (4 rows total).
  for (k = 0; k < 32; k += 8)
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }
  // Horizontal transform: 8-point Hadamard butterflies on each of the 4 rows.
  for (j = 0; j < 4; j++)
  {
    jj = j << 3;
    // Butterfly stage 1 (stride 4).
    m2[j][0] = diff[jj] + diff[jj + 4];
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
    m2[j][4] = diff[jj] - diff[jj + 4];
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
    // Butterfly stage 2 (stride 2).
    m1[j][0] = m2[j][0] + m2[j][2];
    m1[j][1] = m2[j][1] + m2[j][3];
    m1[j][2] = m2[j][0] - m2[j][2];
    m1[j][3] = m2[j][1] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][6];
    m1[j][5] = m2[j][5] + m2[j][7];
    m1[j][6] = m2[j][4] - m2[j][6];
    m1[j][7] = m2[j][5] - m2[j][7];
    // Butterfly stage 3 (stride 1).
    m2[j][0] = m1[j][0] + m1[j][1];
    m2[j][1] = m1[j][0] - m1[j][1];
    m2[j][2] = m1[j][2] + m1[j][3];
    m2[j][3] = m1[j][2] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][5];
    m2[j][5] = m1[j][4] - m1[j][5];
    m2[j][6] = m1[j][6] + m1[j][7];
    m2[j][7] = m1[j][6] - m1[j][7];
  }
  // Vertical transform: 4-point Hadamard butterflies on each of the 8 columns.
  for (i = 0; i < 8; i++)
  {
    // Butterfly stage 1 (stride 2).
    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    // Butterfly stage 2 (stride 1).
    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
  }
  // Accumulate coefficient magnitudes.
  for (i = 0; i < 4; i++)
  {
    for (j = 0; j < 8; j++)
    {
      sad += abs(m2[i][j]);
    }
  }
  // Replace the DC contribution with a quarter-weighted one (VTM convention).
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
  // Normalize to be comparable with square-block SATD.
  sad = (int)(sad / sqrt(4.0 * 8) * 2);
  return sad;
}
/**
 * \brief Hadamard SATD for an arbitrary block size (VTM-style).
 *
 * Tiles the block with the largest subblock shape its dimensions allow and
 * accumulates the SATD of each tile. Shape preference follows the VTM
 * reference: 16x8 / 8x16 for strongly rectangular blocks, then 8x4 / 4x8,
 * then square 8x8, 4x4 and finally 2x2 subblocks. Dimensions that fit none
 * of these (odd sizes) yield zero.
 *
 * \param width        Block width in pixels.
 * \param height       Block height in pixels.
 * \param ref_in       Pointer to the original (reference) block.
 * \param ref_stride   Stride of the reference block in pixels.
 * \param pred_in      Pointer to the predicted block.
 * \param pred_stride  Stride of the predicted block in pixels.
 * \return Accumulated SATD over all subblocks. TODO: 10 bit support.
 */
static uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride)
{
  const uvg_pixel* org = ref_in;
  const uvg_pixel* cur = pred_in;
  uint64_t sum = 0;
  int x;
  int y;

  if (width > height && (height & 7) == 0 && (width & 15) == 0) {
    // Wide block: cover with 16x8 tiles.
    for (y = 0; y < height; y += 8) {
      for (x = 0; x < width; x += 16) {
        sum += xCalcHADs16x8(&org[x], &cur[x], ref_stride, pred_stride);
      }
      org += ref_stride * 8;
      cur += pred_stride * 8;
    }
  } else if (width < height && (width & 7) == 0 && (height & 15) == 0) {
    // Tall block: cover with 8x16 tiles.
    for (y = 0; y < height; y += 16) {
      for (x = 0; x < width; x += 8) {
        sum += xCalcHADs8x16(&org[x], &cur[x], ref_stride, pred_stride);
      }
      org += ref_stride * 16;
      cur += pred_stride * 16;
    }
  } else if (width > height && (height & 3) == 0 && (width & 7) == 0) {
    // Wide block too small for 16x8: cover with 8x4 tiles.
    for (y = 0; y < height; y += 4) {
      for (x = 0; x < width; x += 8) {
        sum += xCalcHADs8x4(&org[x], &cur[x], ref_stride, pred_stride);
      }
      org += ref_stride * 4;
      cur += pred_stride * 4;
    }
  } else if (width < height && (width & 3) == 0 && (height & 7) == 0) {
    // Tall block too small for 8x16: cover with 4x8 tiles.
    for (y = 0; y < height; y += 8) {
      for (x = 0; x < width; x += 4) {
        sum += xCalcHADs4x8(&org[x], &cur[x], ref_stride, pred_stride);
      }
      org += ref_stride * 8;
      cur += pred_stride * 8;
    }
  } else if (height % 8 == 0 && width % 8 == 0) {
    // Square-ish block divisible by 8: cover with 8x8 tiles.
    for (y = 0; y < height; y += 8) {
      for (x = 0; x < width; x += 8) {
        sum += satd_8x8_subblock_generic(&org[x], ref_stride, &cur[x], pred_stride);
      }
      org += 8 * ref_stride;
      cur += 8 * pred_stride;
    }
  } else if (height % 4 == 0 && width % 4 == 0) {
    // Divisible by 4: cover with 4x4 tiles.
    for (y = 0; y < height; y += 4) {
      for (x = 0; x < width; x += 4) {
        sum += uvg_satd_4x4_subblock_generic(&org[x], ref_stride, &cur[x], pred_stride);
      }
      org += 4 * ref_stride;
      cur += 4 * pred_stride;
    }
  } else if (height % 2 == 0 && width % 2 == 0) {
    // Fallback: cover with 2x2 tiles.
    for (y = 0; y < height; y += 2) {
      for (x = 0; x < width; x += 2) {
        sum += xCalcHADs2x2(&org[x], &cur[x], ref_stride, pred_stride);
      }
      org += 2 * ref_stride;
      cur += 2 * pred_stride;
    }
  }
  // TODO: 10 bit
  return sum;
}
// Function macro for defining SAD calculating functions
// for fixed size blocks.
#define SAD_NXN(n, pixel_type) \
@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel)
static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
const int ref_stride, const int rec_stride,
const int width)
const int width, const int height)
{
int ssd = 0;
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
ssd += diff * diff;
@ -783,10 +1355,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)
static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual,
int width, int ref_stride, int pred_stride)
int width, int height, int ref_stride, int pred_stride)
{
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
}
@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs);
success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);
success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);

View file

@ -44,7 +44,6 @@
#include "fast_coeff_cost.h"
#include "reshape.h"
#define QUANT_SHIFT 14
/**
* \brief quantize transformed coefficents
*
@ -62,22 +61,28 @@ void uvg_quant_generic(
uint8_t lfnst_idx)
{
const encoder_control_t * const encoder = state->encoder_control;
const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2;
const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1];
const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height);
int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
uint32_t log2_tr_width = uvg_math_floor_log2(height);
uint32_t log2_tr_height = uvg_math_floor_log2(width);
bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
needs_block_size_trafo_scale |= 0; // Non log2 block size
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform
const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift );
const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
const int32_t q_bits8 = q_bits - 8;
const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6];
uint32_t ac_sum = 0;
const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF;
if(lfnst_idx == 0){
for (int32_t n = 0; n < width * height; n++) {
int32_t level = coef[n];
@ -86,7 +91,7 @@ void uvg_quant_generic(
sign = (level < 0 ? -1 : 1);
int32_t curr_quant_coeff = quant_coeff[n];
int32_t curr_quant_coeff = use_scaling_list ? quant_coeff[n] : default_quant_coeff;
level = (int32_t)((abs_level * curr_quant_coeff + add) >> q_bits);
ac_sum += level;
@ -237,6 +242,7 @@ int uvg_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,
@ -247,28 +253,28 @@ int uvg_quant_cbcr_residual_generic(
uvg_pixel* v_rec_out,
coeff_t* coeff_out,
bool early_skip,
int lmcs_chroma_adj, enum uvg_tree_type tree_type
) {
int lmcs_chroma_adj, enum uvg_tree_type tree_type)
{
ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
// TODO: this function is not fully converted to handle non-square blocks
{
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
}
}
}
uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride);
uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride);
const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
for (int y = 0; y < width; y++)
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
@ -305,33 +311,44 @@ int uvg_quant_cbcr_residual_generic(
}
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
if(cur_cu->cr_lfnst_idx) {
uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
uint8_t lfnst_idx = tree_type == UVG_CHROMA_T ? cur_cu->cr_lfnst_idx : cur_cu->lfnst_idx;
if(lfnst_idx) {
uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode);
}
if (state->encoder_control->cfg.rdoq_enable &&
int abs_sum = 0;
if (!false && state->encoder_control->cfg.dep_quant) {
uvg_dep_quant(
state,
cur_cu,
width,
height,
coeff,
coeff_out,
COLOR_U,
tree_type,
&abs_sum,
state->encoder_control->cfg.scaling_list);
}
else if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip))
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
cur_cu->cr_lfnst_idx);
uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx, 0);
}
else if (state->encoder_control->cfg.rdoq_enable && false) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
scan_order);
}
else {
uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx);
uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, lfnst_idx);
}
int8_t has_coeffs = 0;
{
int i;
for (i = 0; i < width * width; ++i) {
for (i = 0; i < width * height; ++i) {
if (coeff_out[i] != 0) {
has_coeffs = 1;
break;
@ -342,13 +359,13 @@ int uvg_quant_cbcr_residual_generic(
if (has_coeffs && !early_skip) {
// Get quantized residual. (coeff_out -> coeff -> residual)
uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
if (cur_cu->cr_lfnst_idx) {
uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type);
if (lfnst_idx) {
uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode);
}
uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
//if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@ -371,7 +388,7 @@ int uvg_quant_cbcr_residual_generic(
//}
const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
// Get quantized reconstruction. (residual + pred_in -> rec_out)
for (int y = 0; y < width; y++) {
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
if (temp == 2) {
u_residual[x + y * width] = combined_residual[x + y * width];
@ -400,7 +417,7 @@ int uvg_quant_cbcr_residual_generic(
}
}
}
for (int y = 0; y < width; ++y) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val);
@ -413,7 +430,7 @@ int uvg_quant_cbcr_residual_generic(
// With no coeffs and rec_out == pred_int we skip copying the coefficients
// because the reconstruction is just the prediction.
for (int y = 0; y < width; ++y) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
@ -441,7 +458,7 @@ int uvg_quant_cbcr_residual_generic(
* \returns Whether coeff_out contains any non-zero coefficients.
*/
int uvg_quantize_residual_generic(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -454,19 +471,19 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
int has_coeffs = 0;
assert(width <= TR_MAX_WIDTH);
assert(width >= TR_MIN_WIDTH);
const int height = width; // TODO: height for non-square blocks
// With ISP these checks no longer apply, since width and height 2 is now possible
// With MTT even 1x16 and 16x1 ISP splits are possible
//assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH);
//assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
int sign, absval;
int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
sign = residual[x + y * width] >= 0 ? 1 : -1;
absval = sign * residual[x + y * width];
@ -477,43 +494,54 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
// Transform residual. (residual -> coeff)
if (use_trskip) {
uvg_transformskip(state->encoder_control, residual, coeff, width);
uvg_transformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
const uint8_t lfnst_index = tree_type != UVG_CHROMA_T || color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx;
if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
// Forward low frequency non-separable transform
uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
}
// Quantize coeffs. (coeff -> coeff_out)
if (state->encoder_control->cfg.rdoq_enable &&
int abs_sum = 0;
if (!use_trskip && state->encoder_control->cfg.dep_quant) {
uvg_dep_quant(
state,
cur_cu,
width,
height,
coeff,
coeff_out,
color,
tree_type,
&abs_sum,
state->encoder_control->cfg.scaling_list);
}
else if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip)
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
uvg_rdoq(state, coeff, coeff_out, width, width, color,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
lfnst_index);
uvg_rdoq(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0);
} else if(state->encoder_control->cfg.rdoq_enable && use_trskip) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, color,
uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
scan_order);
} else {
uvg_quant(state, coeff, coeff_out, width, width, color,
uvg_quant(state, coeff, coeff_out, width, height, color,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index);
}
// Check if there are any non-zero coefficients.
{
int i;
for (i = 0; i < width * width; ++i) {
for (i = 0; i < width * height; ++i) {
if (coeff_out[i] != 0) {
has_coeffs = 1;
break;
@ -527,25 +555,25 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
int y, x;
// Get quantized residual. (coeff_out -> coeff -> residual)
uvg_dequant(state, coeff_out, coeff, width, width, color,
uvg_dequant(state, coeff_out, coeff, width, height, color,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
// Inverse low frequency non-separable transform
uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
}
if (use_trskip) {
uvg_itransformskip(state->encoder_control, residual, coeff, width);
uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
}
else {
uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
}
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;
int sign, absval;
int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
sign = residual[x + y * width] >= 0 ? 1 : -1;
@ -561,7 +589,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
}
// Get quantized reconstruction. (residual + pred_in -> rec_out)
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val);
@ -573,7 +601,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
// because the reconstruction is just the prediction.
int y, x;
for (y = 0; y < width; ++y) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
}
@ -590,23 +618,29 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
if(encoder->cfg.dep_quant && !transform_skip) {
uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
return;
}
int32_t shift,add,coeff_q;
int32_t n;
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
needs_block_size_trafo_scale |= 0; // Non log2 block size
int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
if (encoder->scaling_list.enable)
{
uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6];
const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6];
shift += 4;
if (shift >qp_scaled / 6) {
@ -624,10 +658,10 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
}
}
} else {
int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
add = 1 << (shift-1);
for (n = 0; n < width*height; n++) {
for (n = 0; n < width * height; n++) {
coeff_q = (q_coef[n] * scale + add) >> shift;
coef[n] = (coeff_t)CLIP(-32768, 32767, coeff_q);
}
@ -651,14 +685,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
weights[3] = (wts_packed >> 48) & 0xffff;
}
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights)
{
assert((width == height) && "Non-square block handling not implemented for this function.");
uint32_t sum = 0;
uint16_t weights_unpacked[4];
get_coeff_weights(weights, weights_unpacked);
for (int32_t i = 0; i < width * width; i++) {
for (int32_t i = 0; i < width * height; i++) {
int16_t curr = coeff[i];
uint32_t curr_abs = abs(curr);
if (curr_abs > 3) {

View file

@ -44,8 +44,6 @@
#include "uvg266.h"
#include "tables.h"
#define QUANT_SHIFT 14
int uvg_strategy_register_quant_generic(void* opaque, uint8_t bitdepth);
void uvg_quant_generic(
const encoder_state_t * const state,
@ -60,7 +58,7 @@ void uvg_quant_generic(
uint8_t lfnst_idx);
int uvg_quantize_residual_generic(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
@ -71,6 +69,7 @@ int uvg_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,

View file

@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0;
dct_func * uvg_dct_8x8 = 0;
dct_func * uvg_dct_16x16 = 0;
dct_func * uvg_dct_32x32 = 0;
dct_func * uvg_dct_non_square = 0;
dct_func * uvg_fast_inverse_dst_4x4 = 0;
@ -56,16 +57,19 @@ void(*uvg_mts_dct)(int8_t bitdepth,
color_t color,
const cu_info_t *tu,
int8_t width,
int8_t height,
const int16_t *input,
int16_t *output,
const int8_t mts_idx);
const int8_t mts_type);
void(*uvg_mts_idct)(int8_t bitdepth,
color_t color,
const cu_info_t *tu,
int8_t width,
int8_t height,
const int16_t *input,
int16_t *output,
const int8_t mts_idx);
const int8_t mts_type);
int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
@ -90,8 +94,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) {
*
* \returns Pointer to the function.
*/
dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
{
if (width != height) {
// Non-square block. Return generic dct for non-square blokcs.
assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function.");
//return uvg_dct_non_square;
}
switch (width) {
case 4:
//if (color == COLOR_Y && type == CU_INTRA) {
@ -119,8 +128,13 @@ dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type)
*
* \returns Pointer to the function.
*/
dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type)
dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type)
{
if (width != height) {
// Non-square block. Return generic dct for non-square blokcs.
assert(false && "This should never be called at this point. Non-square stuff is done inside mts_idct function.");
//return uvg_idct_non_square;
}
switch (width) {
case 4:
//if (color == COLOR_Y && type == CU_INTRA) {

View file

@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4;
extern dct_func * uvg_dct_8x8;
extern dct_func * uvg_dct_16x16;
extern dct_func * uvg_dct_32x32;
extern dct_func * uvg_dct_non_square;
extern dct_func * uvg_fast_inverse_dst_4x4;
@ -64,9 +65,10 @@ typedef void (mts_dct_func)(
color_t color,
const cu_info_t* tu,
int8_t width,
int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx);
const int8_t mts_type);
extern mts_dct_func* uvg_mts_dct;
@ -75,15 +77,16 @@ typedef void (mts_idct_func)(
color_t color,
const cu_info_t* tu,
int8_t width,
int8_t height,
const int16_t* input,
int16_t* output,
const int8_t mts_idx);
const int8_t mts_type);
extern mts_idct_func* uvg_mts_idct;
int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth);
dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type);
dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type);
dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type);
dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type);

View file

@ -0,0 +1,55 @@
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
****************************************************************************/
#include "strategies/strategies-depquant.h"
#include "strategies/avx2/depquant-avx2.h"
#include "strategies/generic/depquant-generic.h"
#include "strategyselector.h"
// Define function pointers.
dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth)
{
bool success = true;
success &= uvg_strategy_register_depquant_generic(opaque, bitdepth);
if (uvg_g_hardware_flags.intel_flags.avx2) {
success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth);
}
return success;
}

View file

@ -0,0 +1,88 @@
#ifndef STRATEGIES_DEPQUANT_H_
#define STRATEGIES_DEPQUANT_H_
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Interface for sao functions.
*/
#include "encoder.h"
#include "encoderstate.h"
#include "global.h" // IWYU pragma: keep
#include "uvg266.h"
#include "dep_quant.h"
// Declare function pointers.
typedef int(dep_quant_decide_and_update_func)(
rate_estimator_t* re,
context_store* ctxs,
struct dep_quant_scan_info const* const scan_info,
const coeff_t absCoeff,
const uint32_t scan_pos,
const uint32_t width_in_sbb,
const uint32_t height_in_sbb,
const NbInfoSbb next_nb_info_ssb,
bool zeroOut,
coeff_t quantCoeff,
const uint32_t effWidth,
const uint32_t effHeight,
bool is_chroma);
typedef void (find_first_non_zero_coeff_func)(
const coeff_t* srcCoeff,
const bool enableScalingLists,
const context_store* const dep_quant_context,
const uint32_t* const scan,
const int32_t* q_coeff,
int* firstTestPos,
int width,
int height);
// Declare function pointers.
extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth);
#define STRATEGIES_DEPQUANT_EXPORTS \
{"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
{"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \
#endif //STRATEGIES_DEPQUANT_H_

View file

@ -49,7 +49,7 @@
typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
cabac_data_t * const cabac,
const coeff_t *coeff,
uint8_t width,
const cu_loc_t * const loc,
uint8_t color,
int8_t scan_mode,
cu_info_t* cur_cu,

View file

@ -38,22 +38,26 @@
* Interface for intra prediction functions.
*/
#include "cu.h"
#include "global.h" // IWYU pragma: keep
#include "intra.h"
#include "uvg266.h"
typedef void (angular_pred_func)(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
const int_fast8_t intra_mode,
const int_fast8_t channel_type,
const uvg_pixel *const in_ref_above,
const uvg_pixel *const in_ref_left,
uvg_pixel *const dst,
const uint8_t multi_ref_idx);
const uint8_t multi_ref_idx,
const uint8_t isp_mode,
const int cu_dim);
typedef void (intra_pred_planar_func)(
const int_fast8_t log2_width,
const cu_loc_t* const cu_loc,
color_t color,
const uvg_pixel *const ref_top,
const uvg_pixel *const ref_left,
uvg_pixel *const dst);
@ -67,8 +71,8 @@ typedef void (intra_pred_filtered_dc_func)(
typedef void (pdpc_planar_dc_func)(
const int mode,
const int width,
const int log2_width,
const cu_loc_t* const cu_loc,
const color_t color,
const uvg_intra_ref *const used_ref,
uvg_pixel *const dst);

View file

@ -37,6 +37,7 @@
#include "strategies/generic/picture-generic.h"
#include "strategies/sse2/picture-sse2.h"
#include "strategies/sse41/picture-sse41.h"
#include "strategies/sse42/picture-sse42.h"
#include "strategyselector.h"
@ -70,6 +71,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0;
cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0;
cost_pixel_any_size_func * uvg_satd_any_size = 0;
cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0;
cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0;
pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0;
@ -115,13 +117,14 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) {
/**
* \brief Get a function that calculates SATD for NxN block.
*
* \param n Width of the region for which SATD is calculated.
* \param width Width of the region for which SATD is calculated.
*
* \returns Pointer to cost_16bit_nxn_func.
*/
cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n)
cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned width, unsigned height)
{
switch (n) {
if(width == height) {
switch (width) {
case 4:
return uvg_satd_4x4;
case 8:
@ -135,19 +138,22 @@ cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n)
default:
return NULL;
}
}
return NULL;
}
/**
* \brief Get a function that calculates SAD for NxN block.
*
* \param n Width of the region for which SAD is calculated.
* \param width Width of the region for which SAD is calculated.
*
* \returns Pointer to cost_16bit_nxn_func.
*/
cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n)
cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned width, unsigned height)
{
switch (n) {
if(width == height) {
switch (width) {
case 4:
return uvg_sad_4x4;
case 8:
@ -161,18 +167,22 @@ cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n)
default:
return NULL;
}
}
return NULL;
}
/**
* \brief Get a function that calculates SATDs for 2 NxN blocks.
*
* \param n Width of the region for which SATD is calculated.
* \param width Width of the region for which SATD is calculated.
* \param height Height of the region for which SATD is calculated.
*
* \returns Pointer to cost_pixel_nxn_multi_func.
*/
cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n)
cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height)
{
switch (n) {
if(width == height) {
switch (width) {
case 4:
return uvg_satd_4x4_dual;
case 8:
@ -186,19 +196,22 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n)
default:
return NULL;
}
}
return NULL;
}
/**
* \brief Get a function that calculates SADs for 2 NxN blocks.
*
* \param n Width of the region for which SAD is calculated.
* \param width Width of the region for which SAD is calculated.
*
* \returns Pointer to cost_pixel_nxn_multi_func.
*/
cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n)
cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height)
{
switch (n) {
if(width == height) {
switch (width) {
case 4:
return uvg_sad_4x4_dual;
case 8:
@ -212,6 +225,8 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n)
default:
return NULL;
}
}
return NULL;
}
// Precomputed CRC32C lookup table for polynomial 0x04C11DB7

View file

@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)(
typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out);
typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);
typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height);
typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
int32_t block_width, int32_t block_height,
@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);
typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride);
extern const uint32_t uvg_crc_table[256];
@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16;
extern cost_pixel_nxn_func * uvg_satd_32x32;
extern cost_pixel_nxn_func * uvg_satd_64x64;
extern cost_pixel_any_size_func *uvg_satd_any_size;
extern cost_pixel_any_size_func *uvg_satd_any_size_vtm;
extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual;
extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual;
@ -203,8 +204,8 @@ extern pixel_var_func *uvg_pixel_var;
extern generate_residual_func* uvg_generate_residual;
int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth);
cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n);
cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n);
cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height);
cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height);
#define STRATEGIES_PICTURE_EXPORTS \
{"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \
@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n);
{"satd_32x32", (void**) &uvg_satd_32x32}, \
{"satd_64x64", (void**) &uvg_satd_64x64}, \
{"satd_any_size", (void**) &uvg_satd_any_size}, \
{"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \
{"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \
{"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \
{"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \

View file

@ -46,7 +46,8 @@ coeff_abs_sum_func *uvg_coeff_abs_sum;
fast_coeff_cost_func *uvg_fast_coeff_cost;
int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) {
int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth)
{
bool success = true;
success &= uvg_strategy_register_quant_generic(opaque, bitdepth);

View file

@ -45,12 +45,23 @@
#include "tables.h"
// Declare function pointers.
typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx);
typedef unsigned (quant_func)(
const encoder_state_t * const state,
coeff_t *coef,
coeff_t *q_coef,
int32_t width,
int32_t height,
color_t color,
int8_t scan_idx,
int8_t block_type,
int8_t transform_skip,
uint8_t lfnst_idx);
typedef unsigned (quant_cbcr_func)(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const int height,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const uvg_pixel* const u_ref_in,
@ -63,16 +74,19 @@ typedef unsigned (quant_cbcr_func)(
bool early_skip,
int lmcs_chroma_adj,
enum uvg_tree_type tree_type);
typedef unsigned (quant_residual_func)(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const cu_info_t *const cur_cu, const int width, const int height, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
const int in_stride, const int out_stride,
const uvg_pixel *const ref_in, const uvg_pixel *const pred_in,
uvg_pixel *rec_out, coeff_t *coeff_out,
bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type);
typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights);
typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

View file

@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
fprintf(stderr, "uvg_strategy_register_encode failed!\n");
return 0;
}
if (!uvg_strategy_register_depquant(&strategies, bitdepth)) {
fprintf(stderr, "uvg_strategy_register_depquant failed!\n");
return 0;
}
while(cur_strategy_to_select->fptr) {
*(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);

View file

@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st
#include "strategies/strategies-intra.h"
#include "strategies/strategies-sao.h"
#include "strategies/strategies-encode.h"
#include "strategies/strategies-depquant.h"
#include "strategies/strategies-alf.h"
static const strategy_to_select_t strategies_to_select[] = {
@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = {
STRATEGIES_SAO_EXPORTS
STRATEGIES_ENCODE_EXPORTS
STRATEGIES_ALF_EXPORTS
STRATEGIES_DEPQUANT_EXPORTS
{ NULL, NULL },
};

File diff suppressed because it is too large Load diff

View file

@ -134,6 +134,15 @@ typedef enum
*/
extern const uint32_t* const uvg_g_sig_last_scan[3][5];
extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1];
extern const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1];
extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2];
#define SCAN_GROUP_TYPES 2
#define MAX_LOG2_INDEX 7
#define SCAN_GROUP_UNGROUPED 0
#define SCAN_GROUP_4X4 1
const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h);
#endif //TABLES_H_

File diff suppressed because it is too large Load diff

View file

@ -44,23 +44,28 @@
#include "global.h" // IWYU pragma: keep
extern const uint8_t uvg_g_chroma_scale[58];
extern const int16_t uvg_g_inv_quant_scales[6];
extern const int16_t uvg_g_quant_scales[6];
extern const int16_t uvg_g_inv_quant_scales[2][6];
extern const int16_t uvg_g_quant_scales[2][6];
void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
#define COEFF_ORDER_LINEAR 0
#define COEFF_ORDER_CU 1
void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);
void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height);
void uvg_transform2d(const encoder_control_t * const encoder,
int16_t *block,
int16_t *coeff,
int8_t block_size,
int8_t block_width,
int8_t block_height,
color_t color,
const cu_info_t *tu);
void uvg_itransform2d(const encoder_control_t * const encoder,
int16_t *block,
int16_t *coeff,
int8_t block_size,
int8_t block_width,
int8_t block_height,
color_t color,
const cu_info_t *tu);
@ -69,11 +74,12 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con
void uvg_derive_lfnst_constraints(
cu_info_t* const pred_cu,
const int depth,
bool* constraints,
const coeff_t* coeff,
const int width,
const int height);
const int height,
const vector2d_t * const ,
color_t color);
typedef struct {
double best_u_cost;
@ -82,6 +88,10 @@ typedef struct {
int best_u_index;
int best_v_index;
int best_combined_index;
uint64_t u_distortion;
uint64_t v_distortion;
double u_bits;
double v_bits;
} uvg_chorma_ts_out_t;
void uvg_quantize_lcu_residual(
@ -89,9 +99,7 @@ void uvg_quantize_lcu_residual(
bool luma,
bool chroma,
const bool jccr,
int32_t x,
int32_t y,
uint8_t depth,
const cu_loc_t* cu_loc,
cu_info_t *cur_cu,
lcu_t* lcu,
bool early_skip,
@ -99,13 +107,10 @@ void uvg_quantize_lcu_residual(
void uvg_chroma_transform_search(
encoder_state_t* const state,
int depth,
lcu_t* const lcu,
cabac_data_t* temp_cabac,
int8_t width,
int8_t height,
const cu_loc_t* const cu_loc,
const int offset,
const uint8_t mode,
cu_info_t* pred_cu,
uvg_pixel u_pred[1024],
uvg_pixel v_pred[1024],
@ -130,7 +135,8 @@ void uvg_fwd_lfnst(
const color_t color,
const uint16_t lfnst_idx,
coeff_t *coeffs,
enum uvg_tree_type tree_type);
enum uvg_tree_type tree_type,
int8_t luma_mode);
void uvg_inv_lfnst(
const cu_info_t* cur_cu,
@ -139,6 +145,7 @@ void uvg_inv_lfnst(
const color_t color,
const uint16_t lfnst_idx,
coeff_t* coeffs,
enum uvg_tree_type tree_type);
enum uvg_tree_type tree_type,
int8_t luma_mode);
#endif

View file

@ -338,7 +338,6 @@ typedef struct uvg_config
int32_t trskip_max_size; /*!< \brief Transform skip max block size. */
enum uvg_mts mts; /*< \brief flag to enable multiple transform selection*/
int32_t mts_implicit; /*< \brief flag to enable implicit multiple transform selection*/
int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */
enum uvg_ime_algorithm ime_algorithm; /*!< \brief Integer motion estimation algorithm. */
int32_t fme_level; /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled). */
int8_t source_scan_type; /*!< \brief Source scan type (0: progressive, 1: top field first, 2: bottom field first).*/
@ -526,6 +525,8 @@ typedef struct uvg_config
/** \brief enable low frequency non-separable transform */
int8_t lfnst;
/** \brief enable intra sub partitions*/
int8_t isp;
int8_t jccr;
@ -542,9 +543,16 @@ typedef struct uvg_config
uint8_t dual_tree;
uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/
uint8_t max_bt_size[3]; /* intra, inter, dual tree chroma*/
uint8_t max_tt_size[3]; /* intra, inter, dual tree chroma*/
uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma*/
uint8_t intra_rough_search_levels;
uint8_t ibc; /* \brief Intra Block Copy parameter */
uint8_t dep_quant;
} uvg_config;
/**

View file

@ -61,7 +61,7 @@ videoframe_t * uvg_videoframe_alloc(int32_t width,
frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
if (cclm) {
assert(chroma_format == UVG_CSP_420);
frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 15) & ~7) + FRAME_PADDING_LUMA) / 4);
frame->cclm_luma_rec_top_line = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64));
}
}

View file

@ -30,7 +30,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int
with open(state_file, "rb") as file:
try:
while True:
type_, x, y, depth, tree_type = file.read(15).decode().split()
type_, x, y, depth, tree_type = file.read(23).decode().split()
# Reset stored data at the beginning of the frame
if x == '0' and y == '0' and type_ == "S" and tree_type != "2":
if not was_zero_last:
@ -38,7 +38,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int
ctx_store = dict()
e_store = set()
was_zero_last = True
else:
elif int(x) >= 64 and int(y) >= 64:
was_zero_last = False
ctx = file.read(ctx_count * ctx_size)

View file

@ -111,7 +111,8 @@ static void setup_tests()
tu.tr_idx = MTS_DST7_DST7 + trafo;
tu.lfnst_idx = 0;
tu.cr_lfnst_idx = 0;
mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
tu.intra.isp_mode = 0;
mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH);
}
}
}
@ -134,7 +135,8 @@ static void setup_tests()
tu.tr_idx = MTS_DST7_DST7 + trafo;
tu.lfnst_idx = 0;
tu.cr_lfnst_idx = 0;
idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH);
tu.intra.isp_mode = 0;
idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH);
}
}
@ -156,6 +158,7 @@ TEST dct(void)
{
char testname[100];
for (int blocksize = 0; blocksize < NUM_SIZES; blocksize++) {
size_t size = 1 << (LCU_MIN_LOG_W + blocksize);
for (int trafo = 0; trafo < NUM_TRANSFORM; trafo++) {
sprintf(testname, "Block: %d x %d, trafo: %d", 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), trafo);
cu_info_t tu;
@ -163,15 +166,21 @@ TEST dct(void)
tu.tr_idx = MTS_DST7_DST7 + trafo;
tu.lfnst_idx = 0;
tu.cr_lfnst_idx = 0;
tu.intra.isp_mode = 0;
int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
for (int y = 0; y < size; ++y) {
if (y>= 16) break;
for (int x = 0; x < size; ++x) {
if (x >= 16) break;
int i = y * size + x;
ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]);
}
}
//fprintf(stderr, "PASS: %s\r\n", testname);
}
}
@ -188,11 +197,14 @@ TEST idct(void)
cu_info_t tu;
tu.type = CU_INTRA;
tu.tr_idx = MTS_DST7_DST7 + trafo;
tu.lfnst_idx = 0;
tu.cr_lfnst_idx = 0;
tu.intra.isp_mode = 0;
int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize];
ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 };
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH);
for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) {
ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]);

View file

@ -46,8 +46,11 @@ TEST test_get_spatial_merge_cand(void)
merge_candidates_t cand = { 0 };
get_spatial_merge_candidates(64 + 32, 64, // x, y
32, 24, // width, height
cu_loc_t cu_loc;
uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y
32, 24); // width, height)
get_spatial_merge_candidates(&cu_loc,
1920, 1080, // picture size
&lcu,
&cand,

View file

@ -6,10 +6,10 @@ set -eu
cabacfile="$(mktemp)"
valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
python3 check_cabac_state_consistency.py "${cabacfile}"
valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
python3 check_cabac_state_consistency.py "${cabacfile}"
rm -rf "${cabacfile}"

View file

@ -19,3 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra
valgrind_test $common_args --rd=3 --cclm --jccr
valgrind_test $common_args --lfnst
valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0
valgrind_test $common_args --rd=2 --isp --cpuid=0 --fast-residual-cost 0
valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra --fast-residual-cost 0

14
tests/test_mtt.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/sh
# Test all-intra coding.
set -eu
. "${0%/*}/util.sh"
common_args='264x130 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-cpuid --no-wpp --fast-residual-cost 0'
valgrind_test $common_args --rd=0 --mtt-depth-intra 1 --pu-depth-intra 2-3
valgrind_test $common_args --rd=3 --mtt-depth-intra 1 --pu-depth-intra 0-5
valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --pu-depth-intra 0-8
valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8
valgrind_test $common_args --rd=3 --rdoq --jccr --isp --lfnst --mip --mrl --mts intra --cclm --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8

View file

@ -51,7 +51,7 @@ static void init_sig_last_scan(uint32_t *buff_d, uint32_t *buff_h,
uint32_t *buff_v,
int32_t width, int32_t height)
{
uint32_t num_scan_pos = width * width;
uint32_t num_scan_pos = width * height;
uint32_t next_scan_pos = 0;
int32_t xx, yy, x, y;
uint32_t scan_line;